transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
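
A file-level diff like the listing below can be reproduced locally by comparing the contents of the two wheels (wheels are zip archives). The sketch below uses only the Python standard library; the wheel paths are placeholders for locally downloaded artifacts (e.g. via `pip download transformers==5.1.0 --no-deps`), and it shows added/removed files only, not per-file line counts.

```python
# Minimal sketch: compare the file listings of two locally downloaded wheels.
# The wheel paths below are placeholders for artifacts downloaded beforehand.
import zipfile

OLD_WHEEL = "transformers-5.0.0rc3-py3-none-any.whl"  # placeholder path
NEW_WHEEL = "transformers-5.1.0-py3-none-any.whl"     # placeholder path

def wheel_files(path: str) -> set[str]:
    """Return the set of file names contained in a wheel (a zip archive)."""
    with zipfile.ZipFile(path) as wheel:
        return set(wheel.namelist())

old_files = wheel_files(OLD_WHEEL)
new_files = wheel_files(NEW_WHEEL)

print("added:", sorted(new_files - old_files))
print("removed:", sorted(old_files - new_files))
print("present in both (possibly modified):", len(old_files & new_files))
```
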
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -37,6 +37,7 @@ from .configuration_auto import (
 from .feature_extraction_auto import AutoFeatureExtractor
 from .image_processing_auto import AutoImageProcessor
 from .tokenization_auto import AutoTokenizer
+from .video_processing_auto import AutoVideoProcessor
 
 
 logger = logging.get_logger(__name__)
@@ -147,6 +148,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
 ("speech_to_text", "Speech2TextProcessor"),
 ("speecht5", "SpeechT5Processor"),
 ("t5gemma2", "Gemma3Processor"),
+("t5gemma2_encoder", "Gemma3Processor"),
 ("trocr", "TrOCRProcessor"),
 ("tvp", "TvpProcessor"),
 ("udop", "UdopProcessor"),
@@ -400,31 +402,20 @@ class AutoProcessor:
 elif type(config) in PROCESSOR_MAPPING:
 return PROCESSOR_MAPPING[type(config)].from_pretrained(pretrained_model_name_or_path, **kwargs)
 
-# At this stage, there doesn't seem to be a `Processor` class available for this model
-#
-
-return AutoTokenizer.from_pretrained(
-pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
-)
-except Exception:
-try:
-return AutoImageProcessor.from_pretrained(
-pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
-)
-except Exception:
-pass
-
+# At this stage, there doesn't seem to be a `Processor` class available for this model.
+# Let's try the commonly available classes
+for klass in (AutoTokenizer, AutoImageProcessor, AutoVideoProcessor, AutoFeatureExtractor):
 try:
-return
+return klass.from_pretrained(
 pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
 )
 except Exception:
-
+continue
 
 raise ValueError(
 f"Unrecognized processing class in {pretrained_model_name_or_path}. Can't instantiate a processor, a "
-"tokenizer, an image processor or a feature extractor for this model.
-"the files of at least one of those processing classes."
+"tokenizer, an image processor, a video processor or a feature extractor for this model. "
+"Make sure the repository contains the files of at least one of those processing classes."
 )
 
 @staticmethod
@@ -113,7 +113,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, str | None](
 ("emu3", "GPT2Tokenizer" if is_tokenizers_available() else None),
 ("ernie", "BertTokenizer" if is_tokenizers_available() else None),
 ("esm", "EsmTokenizer"),
-("exaone4", "GPT2Tokenizer" if is_tokenizers_available() else None),
 ("falcon_mamba", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
 ("fastspeech2_conformer", "FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None),
 ("flaubert", "FlaubertTokenizer"),
@@ -281,7 +280,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, str | None](
 ("seamless_m4t_v2", "SeamlessM4TTokenizer" if is_tokenizers_available() else None),
 ("shieldgemma2", "GemmaTokenizer" if is_tokenizers_available() else None),
 ("siglip", "SiglipTokenizer" if is_sentencepiece_available() else None),
-("siglip2", "
+("siglip2", "Siglip2Tokenizer" if is_tokenizers_available() else None),
 ("speech_to_text", "Speech2TextTokenizer" if is_sentencepiece_available() else None),
 ("speecht5", "SpeechT5Tokenizer" if is_sentencepiece_available() else None),
 ("splinter", "SplinterTokenizer"),
@@ -625,9 +624,21 @@ class AutoTokenizer:
 # Next, let's try to use the tokenizer_config file to get the tokenizer class.
 tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
 tokenizer_config_class = tokenizer_config.get("tokenizer_class", None)
-
+
+# Check for auto_map early to handle dynamic tokenizers properly
+tokenizer_auto_map = None
+if "auto_map" in tokenizer_config:
+if isinstance(tokenizer_config["auto_map"], (tuple, list)):
+# Legacy format for dynamic tokenizers
+tokenizer_auto_map = tokenizer_config["auto_map"]
+else:
+tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None)
+
+# if there is a config, we can check that the tokenizer class != than model class and can thus assume we need to use TokenizersBackend
+# Skip this early exit if auto_map is present (custom tokenizer with trust_remote_code)
 if (
-
+tokenizer_auto_map is None
+and tokenizer_config_class is not None
 and config_model_type is not None
 and config_model_type != ""
 and TOKENIZER_MAPPING_NAMES.get(config_model_type, "").replace("Fast", "")
@@ -644,15 +655,6 @@ class AutoTokenizer:
 if "_commit_hash" in tokenizer_config:
 kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]
 
-# Check for auto_map early to handle dynamic tokenizers properly
-tokenizer_auto_map = None
-if "auto_map" in tokenizer_config:
-if isinstance(tokenizer_config["auto_map"], (tuple, list)):
-# Legacy format for dynamic tokenizers
-tokenizer_auto_map = tokenizer_config["auto_map"]
-else:
-tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None)
-
 if tokenizer_config_class:
 tokenizer_config_class = tokenizer_config_class.replace("Fast", "")
 
@@ -697,7 +699,7 @@
 tokenizer_class = TokenizersBackend
 
 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-elif getattr(config, "tokenizer_class"):
+elif getattr(config, "tokenizer_class", None):
 _class = config.tokenizer_class
 if "PreTrainedTokenizerFast" not in _class:
 _class = _class.replace("Fast", "")
@@ -716,7 +718,7 @@
 )
 config = config.encoder
 
-model_type = config_class_to_model_type(type(config).__name__) or config
+model_type = config_class_to_model_type(type(config).__name__) or getattr(config, "model_type", None)
 if model_type is not None:
 tokenizer_class = TOKENIZER_MAPPING.get(type(config), TokenizersBackend)
 if tokenizer_class is not None:
@@ -34,6 +34,7 @@ from ...modeling_outputs import BaseModelOutput, ModelOutput, SampleTSPrediction
 from ...modeling_utils import PreTrainedModel
 from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput
 from ...utils import auto_docstring, is_torch_flex_attn_available, logging
+from ...utils.generic import is_flash_attention_requested
 from .configuration_autoformer import AutoformerConfig
 
 
@@ -850,7 +851,7 @@ class AutoformerPreTrainedModel(PreTrainedModel):
 inputs_embeds: torch.Tensor,
 ):
 if attention_mask is not None:
-if
+if is_flash_attention_requested(self.config):
 attention_mask = attention_mask if 0 in attention_mask else None
 elif self.config._attn_implementation == "sdpa":
 # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
@@ -48,6 +48,8 @@ class AyaVisionConfig(PreTrainedConfig):
 The epsilon value used for layer normalization in the adapter.
 image_token_index (`int`, *optional*, defaults to 255036):
 The image token index to encode the image prompt.
+tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+Whether to tie weight embeddings.
 """
 
 model_type = "aya_vision"
@@ -65,11 +67,13 @@ class AyaVisionConfig(PreTrainedConfig):
 downsample_factor=2,
 adapter_layer_norm_eps=1e-6,
 image_token_index=255036,
+tie_word_embeddings=True,
 **kwargs,
 ):
 self.image_token_index = image_token_index
 self.downsample_factor = downsample_factor
 self.adapter_layer_norm_eps = adapter_layer_norm_eps
+self.tie_word_embeddings = tie_word_embeddings
 if vision_feature_select_strategy not in ["default", "full"]:
 raise ValueError(
 "vision_feature_select_strategy should be one of 'default', 'full'."
@@ -26,10 +26,10 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache
 from ...generation import GenerationMixin
-from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, ModelOutput
 from ...modeling_utils import PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring,
+from ...utils import TransformersKwargs, auto_docstring, torch_compilable_check
 from ...utils.generic import check_model_inputs
 from ..auto import AutoModel
 from .configuration_aya_vision import AyaVisionConfig
@@ -179,44 +179,26 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
 def set_input_embeddings(self, value):
 self.language_model.set_input_embeddings(value)
 
+@check_model_inputs(tie_last_hidden_states=False)
+@auto_docstring(
+custom_intro="Obtains image last hidden states from the vision tower and apply multimodal projection."
+)
 def get_image_features(
 self,
 pixel_values: torch.FloatTensor,
 vision_feature_layer: int | list[int] | None = None,
 vision_feature_select_strategy: str | None = None,
-
-
-
-Obtains image last hidden states from the vision tower and apply multimodal projection.
-
-Args:
-pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
-The tensors corresponding to the input images.
-vision_feature_layer (`Union[int, list[int]]`, *optional*):
-The index of the layer to select the vision feature. If multiple indices are provided,
-the vision feature of the corresponding indices will be concatenated to form the
-vision features.
-vision_feature_select_strategy (`str`, *optional*):
-The feature selection strategy used to select the vision feature from the vision backbone.
-Can be one of `"default"` or `"full"`
-Returns:
-image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
-"""
-vision_feature_layer = (
-vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
-)
-vision_feature_select_strategy = (
-vision_feature_select_strategy
-if vision_feature_select_strategy is not None
-else self.config.vision_feature_select_strategy
-)
-
-if vision_feature_select_strategy not in ["default", "full"]:
-raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
-
+output_hidden_states: bool | None = None,
+**kwargs: Unpack[TransformersKwargs],
+) -> tuple | BaseModelOutputWithPooling:
 kwargs = {k: v for k, v in kwargs.items() if v is not None}
 # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states.
-image_outputs = self.vision_tower(
+image_outputs = self.vision_tower(
+pixel_values,
+output_hidden_states=True, # Ignore arg on purpose
+return_dict=True,
+**kwargs,
+)
 
 # If we have one vision feature layer, return the corresponding hidden states,
 # otherwise, select the hidden states of each feature layer and concatenate them
@@ -231,8 +213,9 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
 hs_pool = [hs[:, 1:] for hs in hs_pool]
 selected_image_feature = torch.cat(hs_pool, dim=-1)
 
-
-
+image_outputs.pooler_output = self.multi_modal_projector(selected_image_feature)
+
+return image_outputs
 
 def get_placeholder_mask(
 self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
@@ -250,12 +233,12 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
 special_image_mask = input_ids == self.config.image_token_id
 
 n_image_tokens = special_image_mask.sum()
-special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
 n_image_features = image_features.shape[0] * image_features.shape[1]
-
-
-
-
+special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+torch_compilable_check(
+inputs_embeds[special_image_mask].numel() == image_features.numel(),
+f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}",
+)
 return special_image_mask
 
 @check_model_inputs
@@ -274,15 +257,6 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
 cache_position: torch.LongTensor | None = None,
 **kwargs: Unpack[TransformersKwargs],
 ) -> tuple | AyaVisionModelOutputWithPast:
-vision_feature_layer = (
-vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
-)
-vision_feature_select_strategy = (
-vision_feature_select_strategy
-if vision_feature_select_strategy is not None
-else self.config.vision_feature_select_strategy
-)
-
 if (input_ids is None) ^ (inputs_embeds is not None):
 raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
@@ -294,7 +268,8 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
 pixel_values=pixel_values,
 vision_feature_layer=vision_feature_layer,
 vision_feature_select_strategy=vision_feature_select_strategy,
-
+return_dict=True,
+).pooler_output
 image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
 special_image_mask = self.get_placeholder_mask(
 input_ids, inputs_embeds=inputs_embeds, image_features=image_features
@@ -349,13 +324,14 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
 def get_output_embeddings(self) -> nn.Module:
 return self.lm_head
 
+@auto_docstring
 def get_image_features(
 self,
 pixel_values: torch.FloatTensor,
 vision_feature_layer: int | list[int] | None = None,
 vision_feature_select_strategy: str | None = None,
-**kwargs,
-):
+**kwargs: Unpack[TransformersKwargs],
+) -> tuple | BaseModelOutputWithPooling:
 return self.model.get_image_features(
 pixel_values=pixel_values,
 vision_feature_layer=vision_feature_layer,
@@ -363,7 +339,7 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
 **kwargs,
 )
 
-@
+@check_model_inputs(tie_last_hidden_states=False)
 @auto_docstring
 def forward(
 self,
@@ -417,15 +393,6 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
 >>> gen_tokens = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.3)
 >>> processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
 ```"""
-vision_feature_layer = (
-vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
-)
-vision_feature_select_strategy = (
-vision_feature_select_strategy
-if vision_feature_select_strategy is not None
-else self.config.vision_feature_select_strategy
-)
-
 outputs = self.model(
 input_ids=input_ids,
 pixel_values=pixel_values,
@@ -22,13 +22,13 @@ from transformers.models.llava.modeling_llava import (
 LlavaModel,
 LlavaModelOutputWithPast,
 LlavaPreTrainedModel,
-TransformersKwargs,
 )
 
 from ...activations import ACT2FN
 from ...cache_utils import Cache
+from ...modeling_outputs import BaseModelOutputWithPooling
 from ...processing_utils import Unpack
-from ...utils import auto_docstring, logging
+from ...utils import TransformersKwargs, auto_docstring, logging
 from ...utils.generic import check_model_inputs
 from .configuration_aya_vision import AyaVisionConfig
 
@@ -104,44 +104,26 @@ class AyaVisionModelOutputWithPast(LlavaModelOutputWithPast):
 
 class AyaVisionModel(LlavaModel):
 # Unlike LLaVA, the model doesn't have to deal with Pixtral-style image states
+@check_model_inputs(tie_last_hidden_states=False)
+@auto_docstring(
+custom_intro="Obtains image last hidden states from the vision tower and apply multimodal projection."
+)
 def get_image_features(
 self,
 pixel_values: torch.FloatTensor,
 vision_feature_layer: int | list[int] | None = None,
 vision_feature_select_strategy: str | None = None,
-
-
-
-Obtains image last hidden states from the vision tower and apply multimodal projection.
-
-Args:
-pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
-The tensors corresponding to the input images.
-vision_feature_layer (`Union[int, list[int]]`, *optional*):
-The index of the layer to select the vision feature. If multiple indices are provided,
-the vision feature of the corresponding indices will be concatenated to form the
-vision features.
-vision_feature_select_strategy (`str`, *optional*):
-The feature selection strategy used to select the vision feature from the vision backbone.
-Can be one of `"default"` or `"full"`
-Returns:
-image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
-"""
-vision_feature_layer = (
-vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
-)
-vision_feature_select_strategy = (
-vision_feature_select_strategy
-if vision_feature_select_strategy is not None
-else self.config.vision_feature_select_strategy
-)
-
-if vision_feature_select_strategy not in ["default", "full"]:
-raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
-
+output_hidden_states: bool | None = None,
+**kwargs: Unpack[TransformersKwargs],
+) -> tuple | BaseModelOutputWithPooling:
 kwargs = {k: v for k, v in kwargs.items() if v is not None}
 # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states.
-image_outputs = self.vision_tower(
+image_outputs = self.vision_tower(
+pixel_values,
+output_hidden_states=True, # Ignore arg on purpose
+return_dict=True,
+**kwargs,
+)
 
 # If we have one vision feature layer, return the corresponding hidden states,
 # otherwise, select the hidden states of each feature layer and concatenate them
@@ -156,8 +138,9 @@ class AyaVisionModel(LlavaModel):
 hs_pool = [hs[:, 1:] for hs in hs_pool]
 selected_image_feature = torch.cat(hs_pool, dim=-1)
 
-
-
+image_outputs.pooler_output = self.multi_modal_projector(selected_image_feature)
+
+return image_outputs
 
 @check_model_inputs
 @auto_docstring
@@ -175,15 +158,6 @@ class AyaVisionModel(LlavaModel):
 cache_position: torch.LongTensor | None = None,
 **kwargs: Unpack[TransformersKwargs],
 ) -> tuple | AyaVisionModelOutputWithPast:
-vision_feature_layer = (
-vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
-)
-vision_feature_select_strategy = (
-vision_feature_select_strategy
-if vision_feature_select_strategy is not None
-else self.config.vision_feature_select_strategy
-)
-
 if (input_ids is None) ^ (inputs_embeds is not None):
 raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
@@ -195,7 +169,8 @@ class AyaVisionModel(LlavaModel):
 pixel_values=pixel_values,
 vision_feature_layer=vision_feature_layer,
 vision_feature_select_strategy=vision_feature_select_strategy,
-
+return_dict=True,
+).pooler_output
 image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
 special_image_mask = self.get_placeholder_mask(
 input_ids, inputs_embeds=inputs_embeds, image_features=image_features
@@ -100,6 +100,12 @@ class BambaConfig(PreTrainedConfig):
 Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block.
 mamba_proj_bias (`bool`, *optional*, defaults to `False`):
 Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block
+time_step_min (`float`, *optional*, defaults to 0.001):
+Minimum `time_step` used to bound `dt_proj.bias`.
+time_step_max (`float`, *optional*, defaults to 0.1):
+Maximum `time_step` used to bound `dt_proj.bias`.
+time_step_limit (`tuple`, *optional*, defaults to `(0.0, inf)`):
+Accepted range of time step values for clamping.
 z_loss_coefficient (`float`, *optional*, defaults to 0.0):
 Coefficient for auxiliary z-loss used to control logit growth during training
 rope_parameters (`RopeParameters`, *optional*):
@@ -140,6 +146,9 @@
 mamba_chunk_size: int | None = 256,
 mamba_conv_bias: bool | None = True,
 mamba_proj_bias: bool | None = False,
+time_step_min: float | None = 0.001,
+time_step_max: float | None = 0.1,
+time_step_limit: tuple[float, float] | None = (0.0, float("inf")),
 z_loss_coefficient: float | None = 0.0,
 rope_parameters: RopeParameters | None = None,
 **kwargs,
@@ -189,17 +198,18 @@
 self.mamba_chunk_size = mamba_chunk_size
 self.mamba_conv_bias = mamba_conv_bias
 self.mamba_proj_bias = mamba_proj_bias
+self.time_step_min = time_step_min
+self.time_step_max = time_step_max
+self.time_step_limit = tuple(time_step_limit) if time_step_limit is not None else None
 self.z_loss_coefficient = z_loss_coefficient
 self.rope_parameters = rope_parameters
 kwargs["partial_rotary_factor"] = 0.5 # hardcode for BC
 
-
-
-
-
-
-**kwargs,
-)
+self.tie_word_embeddings = tie_word_embeddings
+self.pad_token_id = pad_token_id
+self.bos_token_id = bos_token_id
+self.eos_token_id = eos_token_id
+super().__init__(**kwargs)
 
 @property
 def layers_block_type(self):
@@ -41,8 +41,8 @@ from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
-from ...utils.generic import maybe_autocast
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
+from ...utils.generic import is_flash_attention_requested, maybe_autocast
 from .configuration_bamba import BambaConfig
 
 
@@ -380,9 +380,9 @@ class BambaAttention(nn.Module):
 cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
 key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
-attention_interface: Callable =
-
-
+attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+self.config._attn_implementation, eager_attention_forward
+)
 
 attn_output, attn_weights = attention_interface(
 self,
@@ -518,10 +518,9 @@ class BambaMixer(nn.Module):
 self.head_dim = config.mamba_d_head
 self.chunk_size = config.mamba_chunk_size
 
-
-self.
-self.
-self.time_step_max = 0.1
+self.time_step_limit = config.time_step_limit
+self.time_step_min = config.time_step_min
+self.time_step_max = config.time_step_max
 
 self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size
 self.conv1d = nn.Conv1d(
@@ -964,7 +963,7 @@
 seq_idx: torch.IntTensor | None = None,
 **kwargs,
 ):
-if is_fast_path_available and "cuda" in self.in_proj.weight.device.type:
+if is_fast_path_available and "cuda" in self.in_proj.weight.device.type and not is_torchdynamo_compiling():
 return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask, seq_idx)
 if seq_idx is not None:
 raise NotImplementedError(
@@ -1259,7 +1258,7 @@ class BambaModel(BambaPreTrainedModel):
 past_key_values: HybridMambaAttentionDynamicCache,
 output_attentions: bool,
 ):
-if self.config
+if is_flash_attention_requested(self.config):
 if attention_mask is not None and 0.0 in attention_mask:
 return attention_mask
 return None
@@ -1382,7 +1381,7 @@
 @auto_docstring
 class BambaForCausalLM(BambaPreTrainedModel, GenerationMixin):
 _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
-_tp_plan = {"lm_head": "
+_tp_plan = {"lm_head": "colwise_gather_output"}
 _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
 
 def __init__(self, config):
@@ -1488,55 +1487,24 @@ class BambaForCausalLM(BambaPreTrainedModel, GenerationMixin):
 ):
 # Overwritten -- has a unique cache type, `HybridMambaAttentionDynamicCache`
 
-
-
-# If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
-# Exception 1: when passing input_embeds, input_ids may be missing entries
-# Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
-# Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
-# (we can't check exception 3 while compiling)
-if not empty_past_kv:
-if (
-inputs_embeds is not None # Exception 1
-or cache_position[-1] >= input_ids.shape[1] # Exception 3
-):
-input_ids = input_ids[:, -cache_position.shape[0] :]
-elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
-input_ids = input_ids[:, cache_position]
-else:
+if past_key_values is None:
 past_key_values = HybridMambaAttentionDynamicCache(
 self.config, input_ids.shape[0], self.dtype, device=self.device
 )
 
-
-
-
-
-
-
-
-
-
-
-
-model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases
-
-model_inputs.update(
-{
-"position_ids": position_ids,
-"past_key_values": past_key_values,
-"use_cache": use_cache,
-"attention_mask": attention_mask,
-"logits_to_keep": self.config.num_logits_to_keep,
-"cache_position": cache_position,
-}
+kwargs["logits_to_keep"] = self.config.num_logits_to_keep
+model_inputs = super().prepare_inputs_for_generation(
+input_ids,
+past_key_values=past_key_values,
+attention_mask=attention_mask,
+inputs_embeds=inputs_embeds,
+cache_position=cache_position,
+position_ids=position_ids,
+use_cache=use_cache,
+is_first_iteration=is_first_iteration,
+**kwargs,
 )
 
-# Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
-for key, value in kwargs.items():
-if key not in model_inputs:
-model_inputs[key] = value
-
 return model_inputs
 
 