transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl
This diff shows the changes between two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
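For context, the per-file "+added -removed" counts below are the kind of summary one can compute directly from the two wheels, since a .whl is a zip archive. The following is a minimal sketch, not the registry's actual tooling; the local wheel paths are assumptions, it only inspects `.py` members, and it counts changed lines with `difflib.unified_diff`.

```python
# Hedged sketch: per-file line-count diff between two locally downloaded wheels.
import difflib
import zipfile

OLD = "transformers-5.0.0rc3-py3-none-any.whl"  # assumed local path
NEW = "transformers-5.1.0-py3-none-any.whl"     # assumed local path

def read_members(path: str) -> dict[str, list[str]]:
    """Return {member_name: lines} for Python files inside a wheel (a zip archive)."""
    out = {}
    with zipfile.ZipFile(path) as zf:
        for name in zf.namelist():
            if name.endswith(".py"):
                out[name] = zf.read(name).decode("utf-8", errors="replace").splitlines()
    return out

old, new = read_members(OLD), read_members(NEW)
for name in sorted(set(old) | set(new)):
    a, b = old.get(name, []), new.get(name, [])
    added = removed = 0
    # Count +/- lines from a unified diff, skipping the "---"/"+++" file headers.
    for line in difflib.unified_diff(a, b, lineterm=""):
        if line.startswith("+") and not line.startswith("+++"):
            added += 1
        elif line.startswith("-") and not line.startswith("---"):
            removed += 1
    if added or removed:
        print(f"- {name} +{added} -{removed}")
```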
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
transformers/models/vitmatte/configuration_vitmatte.py

@@ -13,10 +13,10 @@
 # limitations under the License.
 """VitMatte model configuration"""

+from ...backbone_utils import consolidate_backbone_kwargs_to_config
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
-from
-from ..auto.configuration_auto import CONFIG_MAPPING, AutoConfig
+from ..auto.configuration_auto import AutoConfig


 logger = logging.get_logger(__name__)
@@ -35,18 +35,6 @@ class VitMatteConfig(PreTrainedConfig):
     Args:
         backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `VitDetConfig()`):
             The configuration of the backbone model.
-        backbone (`str`, *optional*):
-            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
-            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
-            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
-        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
-            Whether to use pretrained weights for the backbone.
-        use_timm_backbone (`bool`, *optional*, defaults to `False`):
-            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
-            library.
-        backbone_kwargs (`dict`, *optional*):
-            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
-            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
         hidden_size (`int`, *optional*, defaults to 384):
             The number of input channels of the decoder.
         batch_norm_eps (`float`, *optional*, defaults to 1e-05):
@@ -79,10 +67,6 @@
     def __init__(
         self,
         backbone_config: PreTrainedConfig | None = None,
-        backbone=None,
-        use_pretrained_backbone=False,
-        use_timm_backbone=False,
-        backbone_kwargs=None,
         hidden_size: int = 384,
         batch_norm_eps: float = 1e-5,
         initializer_range: float = 0.02,
@@ -90,27 +74,14 @@
         fusion_hidden_sizes: list[int] = [256, 128, 64, 32],
         **kwargs,
     ):
-
-            logger.info("`backbone_config` is `None`. Initializing the config with the default `VitDet` backbone.")
-            backbone_config = CONFIG_MAPPING["vitdet"](out_features=["stage4"])
-        elif isinstance(backbone_config, dict):
-            backbone_model_type = backbone_config.get("model_type")
-            config_class = CONFIG_MAPPING[backbone_model_type]
-            backbone_config = config_class.from_dict(backbone_config)
-
-        verify_backbone_config_arguments(
-            use_timm_backbone=use_timm_backbone,
-            use_pretrained_backbone=use_pretrained_backbone,
-            backbone=backbone,
+        backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
             backbone_config=backbone_config,
-
+            default_config_type="vitdet",
+            default_config_kwargs={"out_features": ["stage4"]},
+            **kwargs,
         )

         self.backbone_config = backbone_config
-        self.backbone = backbone
-        self.use_pretrained_backbone = use_pretrained_backbone
-        self.use_timm_backbone = use_timm_backbone
-        self.backbone_kwargs = backbone_kwargs
         self.batch_norm_eps = batch_norm_eps
         self.hidden_size = hidden_size
         self.initializer_range = initializer_range
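
Note: the VitMatte hunks above collapse the legacy backbone arguments into a single call to the new `consolidate_backbone_kwargs_to_config` helper. The sketch below only restates the call visible in this diff inside a standalone function; the wrapper itself and any behaviour beyond what the hunk shows are assumptions.

    from transformers.backbone_utils import consolidate_backbone_kwargs_to_config

    def build_vitdet_backbone_config(backbone_config=None, **kwargs):
        # Legacy arguments (backbone, use_pretrained_backbone, use_timm_backbone,
        # backbone_kwargs) now travel through **kwargs and are folded into a single
        # backbone config; a default VitDet config is used when nothing is supplied.
        backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
            backbone_config=backbone_config,
            default_config_type="vitdet",
            default_config_kwargs={"out_features": ["stage4"]},
            **kwargs,
        )
        return backbone_config, kwargs
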
transformers/models/vitmatte/image_processing_vitmatte_fast.py

@@ -16,7 +16,7 @@
 from typing import Union

 import torch
-
+import torchvision.transforms.v2.functional as tvF

 from ...image_processing_utils import BatchFeature
 from ...image_processing_utils_fast import (
@@ -81,7 +81,7 @@ class VitMatteImageProcessorFast(BaseImageProcessorFast):

         if pad_width + pad_height > 0:
             padding = (0, 0, pad_width, pad_height)
-            images =
+            images = tvF.pad(images, padding)

         return images

transformers/models/vitmatte/modeling_vitmatte.py

@@ -19,9 +19,9 @@ import torch
 from torch import nn

 from ... import initialization as init
+from ...backbone_utils import load_backbone
 from ...modeling_utils import PreTrainedModel
 from ...utils import ModelOutput, auto_docstring
-from ...utils.backbone_utils import load_backbone
 from .configuration_vitmatte import VitMatteConfig

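
Note: as the file list above indicates, the backbone helpers moved from transformers/utils/backbone_utils.py into the new top-level transformers/backbone_utils.py; this hunk is the corresponding import swap. Illustrative import paths only:

    # transformers 5.1.0
    from transformers.backbone_utils import load_backbone
    # transformers 5.0.0rc3 (old location, now largely emptied)
    # from transformers.utils.backbone_utils import load_backbone
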
transformers/models/vitpose/configuration_vitpose.py

@@ -13,10 +13,10 @@
 # limitations under the License.
 """VitPose model configuration"""

+from ...backbone_utils import consolidate_backbone_kwargs_to_config
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
-from
-from ..auto.configuration_auto import CONFIG_MAPPING, AutoConfig
+from ..auto.configuration_auto import AutoConfig


 logger = logging.get_logger(__name__)
@@ -35,18 +35,6 @@ class VitPoseConfig(PreTrainedConfig):
     Args:
         backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `VitPoseBackboneConfig()`):
             The configuration of the backbone model. Currently, only `backbone_config` with `vitpose_backbone` as `model_type` is supported.
-        backbone (`str`, *optional*):
-            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
-            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
-            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
-        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
-            Whether to use pretrained weights for the backbone.
-        use_timm_backbone (`bool`, *optional*, defaults to `False`):
-            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
-            library.
-        backbone_kwargs (`dict`, *optional*):
-            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
-            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         scale_factor (`int`, *optional*, defaults to 4):
@@ -76,44 +64,19 @@
     def __init__(
         self,
         backbone_config: PreTrainedConfig | None = None,
-        backbone: str | None = None,
-        use_pretrained_backbone: bool = False,
-        use_timm_backbone: bool = False,
-        backbone_kwargs: dict | None = None,
         initializer_range: float = 0.02,
         scale_factor: int = 4,
         use_simple_decoder: bool = True,
         **kwargs,
     ):
-
-            logger.info(
-                "`use_pretrained_backbone` is `True`. For the pure inference purpose of VitPose weight do not set this value."
-            )
-        if use_timm_backbone:
-            raise ValueError("use_timm_backbone set `True` is not supported at the moment.")
-
-        if backbone_config is None and backbone is None:
-            logger.info("`backbone_config` is `None`. Initializing the config with the default `VitPose` backbone.")
-            backbone_config = CONFIG_MAPPING["vitpose_backbone"](out_indices=[4])
-        elif isinstance(backbone_config, dict):
-            backbone_model_type = backbone_config.get("model_type")
-            config_class = CONFIG_MAPPING[backbone_model_type]
-            backbone_config = config_class.from_dict(backbone_config)
-
-        verify_backbone_config_arguments(
-            use_timm_backbone=use_timm_backbone,
-            use_pretrained_backbone=use_pretrained_backbone,
-            backbone=backbone,
+        backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
             backbone_config=backbone_config,
-
+            default_config_type="vitpose_backbone",
+            default_config_kwargs={"out_indices": [4]},
+            **kwargs,
        )

         self.backbone_config = backbone_config
-        self.backbone = backbone
-        self.use_pretrained_backbone = use_pretrained_backbone
-        self.use_timm_backbone = use_timm_backbone
-        self.backbone_kwargs = backbone_kwargs
-
         self.initializer_range = initializer_range
         self.scale_factor = scale_factor
         self.use_simple_decoder = use_simple_decoder
transformers/models/vitpose/modeling_vitpose.py

@@ -19,11 +19,11 @@ import torch
 from torch import nn

 from ... import initialization as init
+from ...backbone_utils import load_backbone
 from ...modeling_outputs import BackboneOutput
 from ...modeling_utils import PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import ModelOutput, TransformersKwargs, auto_docstring, logging
-from ...utils.backbone_utils import load_backbone
 from ...utils.generic import can_return_tuple
 from .configuration_vitpose import VitPoseConfig

@@ -230,13 +230,15 @@ class VitPoseForPoseEstimation(VitPosePreTrainedModel):
         >>> from transformers import AutoImageProcessor, VitPoseForPoseEstimation
         >>> import torch
         >>> from PIL import Image
-        >>> import
+        >>> import httpx
+        >>> from io import BytesIO

         >>> processor = AutoImageProcessor.from_pretrained("usyd-community/vitpose-base-simple")
         >>> model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple")

         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>>
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]]
         >>> inputs = processor(image, boxes=boxes, return_tensors="pt")

transformers/models/vitpose_backbone/configuration_vitpose_backbone.py

@@ -13,9 +13,9 @@
 # limitations under the License.
 """VitPose backbone configuration"""

+from ...backbone_utils import BackboneConfigMixin
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
-from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices


 logger = logging.get_logger(__name__)
@@ -130,9 +130,7 @@ class VitPoseBackboneConfig(BackboneConfigMixin, PreTrainedConfig):
         self.num_channels = num_channels
         self.qkv_bias = qkv_bias
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)]
-        self.
-            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
-        )
+        self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features)


 __all__ = ["VitPoseBackboneConfig"]
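
Note: `set_output_features_output_indices` replaces the old `get_aligned_output_features_output_indices` helper removed above. A toy sketch of how a backbone config might use the new mixin method, assuming it keeps the old alignment semantics; the class and the expected result are illustrative, not taken from this diff:

    from transformers.backbone_utils import BackboneConfigMixin
    from transformers.configuration_utils import PreTrainedConfig

    class ToyBackboneConfig(BackboneConfigMixin, PreTrainedConfig):
        def __init__(self, out_features=None, out_indices=None, **kwargs):
            super().__init__(**kwargs)
            self.stage_names = ["stem", "stage1", "stage2", "stage3", "stage4"]
            # Aligns out_features/out_indices against stage_names (assumed behaviour).
            self.set_output_features_output_indices(
                out_indices=out_indices, out_features=out_features
            )

    cfg = ToyBackboneConfig(out_indices=[4])
    # Expected (assumption): cfg.out_features == ["stage4"], cfg.out_indices == [4]
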
transformers/models/vitpose_backbone/modeling_vitpose_backbone.py

@@ -26,12 +26,12 @@ from torch import nn

 from ... import initialization as init
 from ...activations import ACT2FN
+from ...backbone_utils import BackboneMixin
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BackboneOutput, BaseModelOutput
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, logging
-from ...utils.backbone_utils import BackboneMixin
 from ...utils.generic import check_model_inputs
 from .configuration_vitpose_backbone import VitPoseBackboneConfig

@@ -155,9 +155,9 @@ class VitPoseBackboneSelfAttention(nn.Module):
         value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
         query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)

-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         context_layer, attention_probs = attention_interface(
             self,
@@ -375,10 +375,9 @@ class VitPoseBackbonePreTrainedModel(PreTrainedModel):
     The VitPose backbone useful for downstream tasks.
     """
 )
-class VitPoseBackbone(
+class VitPoseBackbone(BackboneMixin, VitPoseBackbonePreTrainedModel):
     def __init__(self, config: VitPoseBackboneConfig):
         super().__init__(config)
-        super()._init_backbone(config)

         self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
         self.embeddings = VitPoseBackboneEmbeddings(config)
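
Note: this attention hunk is the same refactor applied throughout this release (also in Vivit, VJEPA2, Voxtral and Wav2Vec2 below): the per-model branching over attention backends becomes a single registry lookup. A minimal sketch of the dispatch as it appears in the hunks; the comment about the fallback behaviour is an assumption, not confirmed by the diff:

    from typing import Callable
    from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

    def select_attention(config, eager_attention_forward) -> Callable:
        # Looks up the kernel registered for the configured backend
        # ("eager", "sdpa", "flash_attention_2", ...); the second argument is
        # presumably returned as the eager fallback.
        return ALL_ATTENTION_FUNCTIONS.get_interface(
            config._attn_implementation, eager_attention_forward
        )
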
transformers/models/vits/configuration_vits.py

@@ -129,6 +129,8 @@ class VitsConfig(PreTrainedConfig):
             How random the duration prediction is. Larger values create more variation in the predicted durations.
         sampling_rate (`int`, *optional*, defaults to 16000):
             The sampling rate at which the output audio waveform is digitalized expressed in hertz (Hz).
+        pad_token_id (`int`, *optional*):
+            Padding token id.

     Example:

@@ -193,6 +195,7 @@
         noise_scale=0.667,
         noise_scale_duration=0.8,
         sampling_rate=16_000,
+        pad_token_id=None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -239,6 +242,7 @@
         self.noise_scale = noise_scale
         self.noise_scale_duration = noise_scale_duration
         self.sampling_rate = sampling_rate
+        self.pad_token_id = pad_token_id

         if len(upsample_kernel_sizes) != len(upsample_rates):
             raise ValueError(
transformers/models/vits/modeling_vits.py

@@ -29,7 +29,7 @@ from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, ModelOutput
 from ...modeling_utils import PreTrainedModel
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, logging, torch_compilable_check
 from .configuration_vits import VitsConfig


@@ -210,10 +210,10 @@ def _rational_quadratic_spline(
     """
     upper_bound = tail_bound
     lower_bound = -tail_bound
-
-
-
-
+    torch_compilable_check(
+        (inputs.min() >= lower_bound) & (inputs.max() <= upper_bound),
+        f"Inputs are outside the range [{lower_bound}, {upper_bound}]",
+    )
     num_bins = unnormalized_widths.shape[-1]

     if min_bin_width * num_bins > 1.0:
@@ -283,8 +283,10 @@
     c = -input_delta * intermediate2

     discriminant = b.pow(2) - 4 * a * c
-
-
+    torch_compilable_check(
+        torch.all(discriminant >= 0),
+        f"Discriminant has negative values {discriminant}",
+    )

     root = (2 * c) / (-b - torch.sqrt(discriminant))
     outputs = root * input_bin_widths + input_cumwidths
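
Note: both modeling_vits.py hunks route data-dependent range checks through the new `torch_compilable_check` utility instead of a plain Python raise. Its real implementation is not part of this diff; the stand-in below only mirrors the calling convention shown above, using `torch._assert` as a plausible traceable equivalent:

    import torch

    def compilable_check(condition: torch.Tensor, message: str) -> None:
        # Hypothetical stand-in for transformers.utils.torch_compilable_check.
        torch._assert(condition, message)

    inputs = torch.tensor([0.1, -0.2, 0.4])
    tail_bound = 5.0
    compilable_check(
        (inputs.min() >= -tail_bound) & (inputs.max() <= tail_bound),
        f"Inputs are outside the range [{-tail_bound}, {tail_bound}]",
    )
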
transformers/models/vivit/modeling_vivit.py

@@ -215,9 +215,9 @@ class VivitSelfAttention(nn.Module):
         value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
         query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)

-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         context_layer, attention_probs = attention_interface(
             self,
@@ -364,7 +364,7 @@ class VivitPreTrainedModel(PreTrainedModel):
     main_input_name = "pixel_values"
     input_modalities = "video"
     supports_gradient_checkpointing = True
-    _no_split_modules = []
+    _no_split_modules = ["VivitLayer"]
     _supports_sdpa = True
     _supports_flash_attn = True
     _supports_flex_attn = True
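
Note: adding `VivitLayer` to `_no_split_modules` keeps each transformer block on a single device when the checkpoint is sharded across devices. Purely as an illustration (requires `accelerate`; the checkpoint name is an example, not taken from this diff):

    from transformers import VivitForVideoClassification

    model = VivitForVideoClassification.from_pretrained(
        "google/vivit-b-16x2-kinetics400", device_map="auto"
    )
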
transformers/models/vjepa2/modeling_vjepa2.py

@@ -317,9 +317,9 @@ class VJEPA2RopeAttention(nn.Module):
         key_layer = self.apply_rotary_embeddings(key_layer, pos_ids)
         query_layer = self.apply_rotary_embeddings(query_layer, pos_ids)

-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         context_layer, attention_probs = attention_interface(
             self,
@@ -726,9 +726,9 @@ class VJEPA2PoolerSelfAttention(nn.Module):
         keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
         values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
@@ -795,9 +795,9 @@ class VJEPA2PoolerCrossAttention(nn.Module):
         keys = keys.view(batch_size, kv_seq_length, self.num_heads, self.head_dim).transpose(1, 2)
         values = values.view(batch_size, kv_seq_length, self.num_heads, self.head_dim).transpose(1, 2)

-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
transformers/models/voxtral/configuration_voxtral.py

@@ -189,7 +189,6 @@ class VoxtralConfig(PreTrainedConfig):
             text_config = CONFIG_MAPPING["llama"](**self._default_text_config_kwargs)
         self.text_config = text_config

-        self.vocab_size = text_config.vocab_size
         self.hidden_size = text_config.hidden_size
         self.audio_token_id = audio_token_id
         self.projector_hidden_act = projector_hidden_act
transformers/models/voxtral/modeling_voxtral.py

@@ -29,7 +29,7 @@ from ...activations import ACT2FN
 from ...cache_utils import Cache
 from ...generation import GenerationMixin
 from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import
+from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, CausalLMOutputWithPast
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
@@ -133,9 +133,9 @@ class VoxtralAttention(nn.Module):
         key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
         value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
@@ -261,7 +261,6 @@ class VoxtralEncoder(VoxtralPreTrainedModel):

         embed_dim = config.d_model
         self.num_mel_bins = config.num_mel_bins
-        self.padding_idx = config.pad_token_id
         self.max_source_positions = config.max_source_positions
         self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

@@ -297,7 +296,7 @@
         input_features,
         attention_mask=None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> tuple | BaseModelOutputWithPooling:
         r"""
         Args:
             input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
@@ -334,7 +333,7 @@

         hidden_states = self.layer_norm(hidden_states)

-        return
+        return BaseModelOutputWithPooling(
             last_hidden_state=hidden_states,
         )

@@ -398,26 +397,28 @@ class VoxtralForConditionalGeneration(VoxtralPreTrainedModel, GenerationMixin):
     def get_decoder(self):
         return self.language_model.get_decoder()

-
-
-        This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.
-
-
-
-
-
-
-
-
-
-        `
-
+    @can_return_tuple
+    @auto_docstring(
+        custom_intro="This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector."
+    )
+    def get_audio_features(
+        self, input_features: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        input_features (`torch.FloatTensor`):
+            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
+            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
+            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
+            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
         """
-        audio_outputs = self.audio_tower(input_features)
+        audio_outputs = self.audio_tower(input_features, return_dict=True, **kwargs)
         audio_hidden_states = audio_outputs.last_hidden_state
         audio_hidden_states = audio_hidden_states.reshape(-1, self.config.audio_config.intermediate_size)
         audio_embeds = self.multi_modal_projector(audio_hidden_states)
-
+        audio_outputs.pooler_output = audio_embeds
+
+        return audio_outputs

     @can_return_tuple
     @auto_docstring
@@ -472,7 +473,7 @@ class VoxtralForConditionalGeneration(VoxtralPreTrainedModel, GenerationMixin):
             inputs_embeds = self.get_input_embeddings()(input_ids)

         if input_features is not None and input_ids is not None:
-            audio_embeds = self.get_audio_features(input_features)
+            audio_embeds = self.get_audio_features(input_features, return_dict=True).pooler_output

             # replace text-audio token placeholders with audio embeddings
             audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
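
Note: with the Voxtral hunks above, the audio encoder returns a `BaseModelOutputWithPooling` and `get_audio_features` attaches the projected audio embeddings to its `pooler_output`, which the forward pass then consumes. A minimal usage sketch under that contract (the checkpoint name and toy feature shape are assumptions, not from the diff):

    import torch
    from transformers import VoxtralForConditionalGeneration

    model = VoxtralForConditionalGeneration.from_pretrained("mistralai/Voxtral-Mini-3B-2507")
    # Toy mel features; real ones come from the processor / feature extractor.
    input_features = torch.randn(1, model.config.audio_config.num_mel_bins, 3000)

    audio_out = model.get_audio_features(input_features, return_dict=True)
    audio_embeds = audio_out.pooler_output        # projected embeddings (new in 5.1.0)
    encoder_states = audio_out.last_hidden_state  # raw encoder states
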
transformers/models/voxtral/modular_voxtral.py

@@ -19,7 +19,11 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache
 from ...generation import GenerationMixin
-from ...modeling_outputs import
+from ...modeling_outputs import (
+    BaseModelOutputWithPast,
+    BaseModelOutputWithPooling,
+    CausalLMOutputWithPast,
+)
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
 from ...utils.generic import check_model_inputs
@@ -67,7 +71,7 @@ class VoxtralEncoder(Qwen2AudioEncoder):
         input_features,
         attention_mask=None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> tuple | BaseModelOutputWithPooling:
         r"""
         Args:
             input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
@@ -104,7 +108,7 @@

         hidden_states = self.layer_norm(hidden_states)

-        return
+        return BaseModelOutputWithPooling(
             last_hidden_state=hidden_states,
         )

@@ -159,26 +163,28 @@ class VoxtralForConditionalGeneration(VoxtralPreTrainedModel, GenerationMixin):
     def get_decoder(self):
         return self.language_model.get_decoder()

-
-
-        This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.
-
-
-
-
-
-
-
-
-
-        `
-
+    @can_return_tuple
+    @auto_docstring(
+        custom_intro="This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector."
+    )
+    def get_audio_features(
+        self, input_features: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
+    ) -> tuple | BaseModelOutputWithPooling:
+        r"""
+        input_features (`torch.FloatTensor`):
+            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
+            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
+            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
+            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
         """
-        audio_outputs = self.audio_tower(input_features)
+        audio_outputs = self.audio_tower(input_features, return_dict=True, **kwargs)
         audio_hidden_states = audio_outputs.last_hidden_state
         audio_hidden_states = audio_hidden_states.reshape(-1, self.config.audio_config.intermediate_size)
         audio_embeds = self.multi_modal_projector(audio_hidden_states)
-
+        audio_outputs.pooler_output = audio_embeds
+
+        return audio_outputs

     @can_return_tuple
     @auto_docstring
@@ -233,7 +239,7 @@ class VoxtralForConditionalGeneration(VoxtralPreTrainedModel, GenerationMixin):
             inputs_embeds = self.get_input_embeddings()(input_ids)

         if input_features is not None and input_ids is not None:
-            audio_embeds = self.get_audio_features(input_features)
+            audio_embeds = self.get_audio_features(input_features, return_dict=True).pooler_output

             # replace text-audio token placeholders with audio embeddings
             audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
transformers/models/wav2vec2/configuration_wav2vec2.py

@@ -259,7 +259,10 @@ class Wav2Vec2Config(PreTrainedConfig):
         adapter_attn_dim=None,
         **kwargs,
     ):
-        super().__init__(**kwargs
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.hidden_size = hidden_size
         self.feat_extract_norm = feat_extract_norm
         self.feat_extract_activation = feat_extract_activation
transformers/models/wav2vec2/modeling_wav2vec2.py

@@ -39,7 +39,7 @@ from ...modeling_outputs import (
     Wav2Vec2BaseModelOutput,
     XVectorOutput,
 )
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel, get_torch_context_manager_or_global_device
 from ...processing_utils import Unpack
 from ...utils import (
     ModelOutput,
@@ -528,9 +528,9 @@ class Wav2Vec2Attention(nn.Module):
         key_states = self.k_proj(current_states).view(*kv_input_shape).transpose(1, 2)
         value_states = self.v_proj(current_states).view(*kv_input_shape).transpose(1, 2)

-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
@@ -1641,6 +1641,9 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
         This method is **not** supposed to be called by the user and is prone to be changed in the future.
         """

+        if get_torch_context_manager_or_global_device() == torch.device("meta"):
+            return
+
         # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to
         # correctly load adapter layers for Wav2Vec2 so that we do not have to introduce a new API to
         # [`PreTrainedModel`]. While slightly hacky, Wav2Vec2 never has to tie input and output embeddings, so that it is