transformers 5.0.0rc3-py3-none-any.whl → 5.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -25,11 +25,7 @@ from ...generation import GenerationMixin
 from ...integrations import lazy_load_kernel
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_utils import PreTrainedModel
-from ...utils import (
-    ModelOutput,
-    auto_docstring,
-    logging,
-)
+from ...utils import ModelOutput, auto_docstring, is_torchdynamo_compiling, logging
 from .configuration_mamba2 import Mamba2Config


@@ -658,7 +654,7 @@ class Mamba2Mixer(nn.Module):
         cache_position: torch.LongTensor | None = None,
         attention_mask: torch.Tensor | None = None,
     ):
-        if is_fast_path_available and "cuda" in self.in_proj.weight.device.type:
+        if is_fast_path_available and "cuda" in self.in_proj.weight.device.type and not is_torchdynamo_compiling():
             return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask)
         return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)

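Note: the new guard keeps the fused CUDA kernel path out of torch.compile traces, since custom kernels graph-break under dynamo; tracing falls through to the pure-PyTorch `torch_forward`. A minimal sketch of the same dispatch pattern, assuming `torch.compiler.is_compiling()` as the public check (the library routes this through its own `is_torchdynamo_compiling` helper; the attributes on `self` are illustrative):

import torch

def forward(self, hidden_states):
    # Fast path: hand-written CUDA kernels, but only outside of dynamo tracing.
    if self.fast_path_available and not torch.compiler.is_compiling():
        return self.cuda_kernels_forward(hidden_states)
    # Traceable fallback: plain tensor ops that torch.compile can capture.
    return self.torch_forward(hidden_states)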
@@ -930,7 +926,7 @@ class Mamba2Model(Mamba2PreTrainedModel):
     """
 )
 class Mamba2ForCausalLM(Mamba2PreTrainedModel, GenerationMixin):
-    _tied_weights_keys = {}
+    _tied_weights_keys = {"lm_head.weight": "backbone.embeddings.weight"}

     def __init__(self, config):
         super().__init__(config)
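Note: the mapping declares `lm_head.weight` as tied to the backbone embedding table, so weight tying is restored at load time. A hedged check, assuming a Mamba2 checkpoint with `tie_word_embeddings` enabled (checkpoint id taken from the Mamba2 docs):

from transformers import Mamba2ForCausalLM

model = Mamba2ForCausalLM.from_pretrained("mistralai/Mamba-Codestral-7B-v0.1")
# With tying active, both names resolve to the same storage.
print(model.lm_head.weight.data_ptr() == model.backbone.embeddings.weight.data_ptr())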
@@ -956,41 +952,34 @@ class Mamba2ForCausalLM(Mamba2PreTrainedModel, GenerationMixin):
         is_first_iteration: bool | None = False,
         **kwargs,
     ):
-        # Overwritten -- uses `cache_params` as opposed to `past_key_values`
-
+        # Overwritten -- has custom cache class `Mamba2Cache`
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_params=cache_params,
+            cache_position=cache_position,
+            attention_mask=attention_mask,
+            is_first_iteration=is_first_iteration,
+            **kwargs,
+        )
+
         if use_cache and cache_params is None:
             # we initialize the `cache_position` to full size of `conv_states` at prefill stage
             # considering padding will be applied when input length is shorter, and truncation
             # will be applied when it is longer, so it will be equivalent to always have it match
             # the length of `cache_params.conv_states`, which is `config.conv_kernel`
-            cache_position = torch.arange(0, self.backbone.config.conv_kernel, device=input_ids.device)
+            model_inputs["cache_position"] = torch.arange(0, self.backbone.config.conv_kernel, device=input_ids.device)
             if inputs_embeds is not None:
-                model_inputs = {"inputs_embeds": inputs_embeds}
                 max_batch_size = inputs_embeds.size(0)
             else:
                 max_batch_size = input_ids.size(0)
-            cache_params = Mamba2Cache(
-                self.backbone.config, max_batch_size, device=self.device, dtype=self.dtype
-            )
-
-            attention_mask = None
-
-        if not use_cache and inputs_embeds is not None:
-            model_inputs = {"inputs_embeds": inputs_embeds}
-
-        model_inputs.update(
-            {
-                "cache_params": cache_params,
-                "use_cache": use_cache,
-                "cache_position": cache_position,
-                "attention_mask": attention_mask,
-            }
-        )
-
-        # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
-        for key, value in kwargs.items():
-            if key not in model_inputs:
-                model_inputs[key] = value
+            model_inputs["cache_params"] = Mamba2Cache(
+                self.backbone.config, max_batch_size, device=self.device, dtype=self.dtype
+            )
+        elif use_cache and cache_position[0] > 0:
+            model_inputs["attention_mask"] = None

         return model_inputs

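Note: the override now delegates to `GenerationMixin.prepare_inputs_for_generation` and only patches the `Mamba2Cache`-specific entries, instead of rebuilding `model_inputs` by hand. A hedged sketch of the call path this serves (checkpoint id assumed, as above; generation defaults apply):

from transformers import AutoTokenizer, Mamba2ForCausalLM

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mamba-Codestral-7B-v0.1")
model = Mamba2ForCausalLM.from_pretrained("mistralai/Mamba-Codestral-7B-v0.1")
inputs = tokenizer("The capital of France is", return_tensors="pt")
# generate() calls prepare_inputs_for_generation on each step; the first call
# allocates the Mamba2Cache and the full-width cache_position seen above.
output_ids = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))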
@@ -121,10 +121,15 @@ class MarianConfig(PreTrainedConfig):
         scale_embedding=False,
         pad_token_id=58100,
         eos_token_id=0,
+        bos_token_id=None,
         forced_eos_token_id=0,
         share_encoder_decoder_embeddings=True,
+        is_decoder=False,
+        tie_word_embeddings=True,
         **kwargs,
     ):
+        self.is_decoder = is_decoder
+        self.tie_word_embeddings = tie_word_embeddings
         self.vocab_size = vocab_size
         self.decoder_vocab_size = decoder_vocab_size or vocab_size
         self.max_position_embeddings = max_position_embeddings
@@ -146,12 +151,13 @@ class MarianConfig(PreTrainedConfig):
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
         self.share_encoder_decoder_embeddings = share_encoder_decoder_embeddings
-
+        self.pad_token_id = pad_token_id
+        self.eos_token_id = eos_token_id
+        self.bos_token_id = bos_token_id
+        self.decoder_start_token_id = decoder_start_token_id
+        self.tie_word_embeddings = share_encoder_decoder_embeddings
         super().__init__(
-            pad_token_id=pad_token_id,
-            eos_token_id=eos_token_id,
             is_encoder_decoder=is_encoder_decoder,
-            decoder_start_token_id=decoder_start_token_id,
             forced_eos_token_id=forced_eos_token_id,
             **kwargs,
         )
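Note: MarianConfig now stores the special-token ids and tying flags as plain attributes before calling `super().__init__()` instead of forwarding them as keyword arguments. A quick sketch of the resulting constructor surface (default values taken from the signature above):

from transformers import MarianConfig

config = MarianConfig()
print(config.pad_token_id, config.eos_token_id, config.bos_token_id)  # 58100 0 None
# Per the hunk above, `tie_word_embeddings` ends up mirroring
# `share_encoder_decoder_embeddings`, regardless of the constructor argument.
print(config.tie_word_embeddings == config.share_encoder_decoder_embeddings)  # True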
@@ -234,9 +234,9 @@ class MarianAttention(nn.Module):
         if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
             past_key_values.is_updated[self.layer_idx] = True

-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
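Note: `ALL_ATTENTION_FUNCTIONS.get_interface(name, default)` collapses the old two-step lookup (start from eager, swap in a registered kernel if the config asks for one) into a single registry call. A minimal sketch of the pattern with illustrative names (`ATTENTION_REGISTRY` and the function bodies stand in for the library's internals):

from collections.abc import Callable

def eager_attention_forward(*args, **kwargs):
    ...  # reference implementation, always available

ATTENTION_REGISTRY: dict[str, Callable] = {}  # e.g. "sdpa", "flash_attention_2"

def get_interface(attn_implementation: str, default: Callable) -> Callable:
    # Return the kernel registered for the configured implementation,
    # falling back to the eager reference when none is registered.
    return ATTENTION_REGISTRY.get(attn_implementation, default)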
@@ -1307,7 +1307,7 @@ class MarianForCausalLM(MarianPreTrainedModel, GenerationMixin):
         >>> from transformers import AutoTokenizer, MarianForCausalLM

         >>> tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")
-        >>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-fr-en", add_cross_attention=False)
+        >>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-fr-en")
         >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -121,12 +121,10 @@ class MarkupLMConfig(PreTrainedConfig):
         classifier_dropout=None,
         **kwargs,
     ):
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            **kwargs,
-        )
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
@@ -373,9 +373,9 @@ class MarkupLMSelfAttention(nn.Module):
         key_states = self.key(hidden_states).view(hidden_shape).transpose(1, 2)
         value_states = self.value(hidden_states).view(hidden_shape).transpose(1, 2)

-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
@@ -13,10 +13,10 @@
 # limitations under the License.
 """Mask2Former model configuration"""

+from ...backbone_utils import consolidate_backbone_kwargs_to_config
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
-from ...utils.backbone_utils import verify_backbone_config_arguments
-from ..auto import CONFIG_MAPPING, AutoConfig
+from ..auto import AutoConfig


 logger = logging.get_logger(__name__)
@@ -39,18 +39,6 @@ class Mask2FormerConfig(PreTrainedConfig):
         backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `SwinConfig()`):
             The configuration of the backbone model. If unset, the configuration corresponding to
             `swin-base-patch4-window12-384` will be used.
-        backbone (`str`, *optional*):
-            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
-            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
-            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
-        use_pretrained_backbone (`bool`, *optional*, `False`):
-            Whether to use pretrained weights for the backbone.
-        use_timm_backbone (`bool`, *optional*, `False`):
-            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
-            library.
-        backbone_kwargs (`dict`, *optional*):
-            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
-            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
         feature_size (`int`, *optional*, defaults to 256):
             The features (channels) of the resulting feature maps.
         mask_feature_size (`int`, *optional*, defaults to 256):
@@ -159,40 +147,21 @@ class Mask2FormerConfig(PreTrainedConfig):
         use_auxiliary_loss: bool = True,
         feature_strides: list[int] = [4, 8, 16, 32],
         output_auxiliary_logits: bool | None = None,
-        backbone: str | None = None,
-        use_pretrained_backbone: bool = False,
-        use_timm_backbone: bool = False,
-        backbone_kwargs: dict | None = None,
         **kwargs,
     ):
-        if backbone_config is None and backbone is None:
-            logger.info("`backbone_config` is `None`. Initializing the config with the default `Swin` backbone.")
-            backbone_config = CONFIG_MAPPING["swin"](
-                image_size=224,
-                num_channels=3,
-                patch_size=4,
-                embed_dim=96,
-                depths=[2, 2, 18, 2],
-                num_heads=[3, 6, 12, 24],
-                window_size=7,
-                drop_path_rate=0.3,
-                use_absolute_embeddings=False,
-                out_features=["stage1", "stage2", "stage3", "stage4"],
-            )
-        elif isinstance(backbone_config, dict):
-            backbone_model_type = backbone_config.pop("model_type")
-            config_class = CONFIG_MAPPING[backbone_model_type]
-            backbone_config = config_class.from_dict(backbone_config)
-
-        verify_backbone_config_arguments(
-            use_timm_backbone=use_timm_backbone,
-            use_pretrained_backbone=use_pretrained_backbone,
-            backbone=backbone,
+        backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
             backbone_config=backbone_config,
-            backbone_kwargs=backbone_kwargs,
+            default_config_type="swin",
+            default_config_kwargs={
+                "depths": [2, 2, 18, 2],
+                "drop_path_rate": 0.3,
+                "out_features": ["stage1", "stage2", "stage3", "stage4"],
+            },
+            **kwargs,
         )
+
         # verify that the backbone is supported
-        if backbone_config is not None and backbone_config.model_type not in self.backbones_supported:
+        if backbone_config.model_type not in self.backbones_supported:
             logger.warning_once(
                 f"Backbone {backbone_config.model_type} is not a supported model and may not be compatible with Mask2Former. "
                 f"Supported model types: {','.join(self.backbones_supported)}"
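Note: judging from the call sites in this diff, `consolidate_backbone_kwargs_to_config` folds the old `backbone`/`use_pretrained_backbone`/`use_timm_backbone`/`backbone_kwargs` argument soup into a single `backbone_config`, building a default config (here Swin) when none is supplied and returning the leftover kwargs. A hedged usage sketch mirroring the call above; the helper's internals are in the new transformers/backbone_utils.py, which is not shown here:

from transformers.backbone_utils import consolidate_backbone_kwargs_to_config

backbone_config, remaining_kwargs = consolidate_backbone_kwargs_to_config(
    backbone_config=None,  # None -> fall back to the default Swin configuration
    default_config_type="swin",
    default_config_kwargs={"depths": [2, 2, 18, 2], "drop_path_rate": 0.3},
)
print(backbone_config.model_type)  # "swin"

Since the old keyword arguments are removed from the Mask2Former/MaskFormer config constructors entirely, callers should now pass a `backbone_config` instead.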
@@ -227,10 +196,6 @@ class Mask2FormerConfig(PreTrainedConfig):
         self.feature_strides = feature_strides
         self.output_auxiliary_logits = output_auxiliary_logits
         self.num_hidden_layers = decoder_layers
-        self.backbone = backbone
-        self.use_pretrained_backbone = use_pretrained_backbone
-        self.use_timm_backbone = use_timm_backbone
-        self.backbone_kwargs = backbone_kwargs

         super().__init__(**kwargs)

@@ -22,8 +22,8 @@ import math
 from typing import Any, Optional, Union

 import torch
+import torchvision.transforms.v2.functional as tvF
 from torch import nn
-from torchvision.transforms.v2 import functional as F

 from transformers.image_transforms import get_size_with_aspect_ratio

@@ -144,7 +144,7 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast):
         image: torch.Tensor,
         size: SizeDict,
         size_divisor: int = 0,
-        interpolation: Optional["F.InterpolationMode"] = None,
+        interpolation: Optional["tvF.InterpolationMode"] = None,
         **kwargs,
     ) -> torch.Tensor:
         """
@@ -169,7 +169,7 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast):
             interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
                 Resampling filter to use if resizing the image.
         """
-        interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
+        interpolation = interpolation if interpolation is not None else tvF.InterpolationMode.BILINEAR
         if size.shortest_edge and size.longest_edge:
             # Resize the image so that the shortest edge or the longest edge is of the given size
             # while maintaining the aspect ratio of the original image.
@@ -193,7 +193,7 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast):
             width = int(math.ceil(width / size_divisor) * size_divisor)
             new_size = (height, width)

-        image = F.resize(
+        image = tvF.resize(
             image,
             size=new_size,
             interpolation=interpolation,
@@ -219,9 +219,9 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast):
         )
         if original_size != padded_size:
             padding = [0, 0, padding_right, padding_bottom]
-            images = F.pad(images, padding, fill=fill)
+            images = tvF.pad(images, padding, fill=fill)
             if segmentation_maps is not None:
-                segmentation_maps = [F.pad(mask, padding, fill=ignore_index) for mask in segmentation_maps]
+                segmentation_maps = [tvF.pad(mask, padding, fill=ignore_index) for mask in segmentation_maps]

         # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
         pixel_mask = torch.zeros((images.shape[0], *padded_size), dtype=torch.int64, device=images.device)
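Note: the fast image processors now import `torchvision.transforms.v2.functional` under the explicit `tvF` alias instead of the bare `F`. A standalone sketch of the calls used above (shapes illustrative):

import torch
import torchvision.transforms.v2.functional as tvF

image = torch.zeros(3, 480, 640)
resized = tvF.resize(image, size=[512, 512], interpolation=tvF.InterpolationMode.BILINEAR)
# Padding order is [left, top, right, bottom]; `fill` sets the padded value.
padded = tvF.pad(resized, [0, 0, 32, 32], fill=0)
print(padded.shape)  # torch.Size([3, 544, 544])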
@@ -287,7 +287,7 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast):
         size: SizeDict | None,
         pad_size: SizeDict | None,
         size_divisor: int | None,
-        interpolation: Union["PILImageResampling", "F.InterpolationMode"] | None,
+        interpolation: Union["PILImageResampling", "tvF.InterpolationMode"] | None,
         do_rescale: bool | None,
         rescale_factor: float | None,
         do_normalize: bool | None,
@@ -322,7 +322,7 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast):
                 image=stacked_segmentation_maps,
                 size=size,
                 size_divisor=size_divisor,
-                interpolation=F.InterpolationMode.NEAREST_EXACT,
+                interpolation=tvF.InterpolationMode.NEAREST_EXACT,
             )
             resized_images_grouped[shape] = stacked_images
         if segmentation_maps is not None:
@@ -23,13 +23,13 @@ from torch import Tensor, nn

 from ... import initialization as init
 from ...activations import ACT2FN
+from ...backbone_utils import load_backbone
 from ...file_utils import ModelOutput, is_scipy_available, requires_backends
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import compile_compatible_method_lru_cache
-from ...utils import auto_docstring, is_accelerate_available, logging
-from ...utils.backbone_utils import load_backbone
+from ...utils import auto_docstring, is_accelerate_available, logging, torch_compilable_check
 from .configuration_mask2former import Mask2FormerConfig


@@ -939,10 +939,10 @@ class Mask2FormerPixelDecoderEncoderMultiscaleDeformableAttention(nn.Module):
         batch_size, num_queries, _ = hidden_states.shape
         batch_size, sequence_length, _ = encoder_hidden_states.shape
         total_elements = sum(height * width for height, width in spatial_shapes_list)
-        if total_elements != sequence_length:
-            raise ValueError(
-                "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
-            )
+        torch_compilable_check(
+            total_elements == sequence_length,
+            "Make sure to align the spatial shapes with the sequence length of the encoder hidden states",
+        )

         value = self.value_proj(encoder_hidden_states)
         if attention_mask is not None:
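Note: `torch_compilable_check` replaces a data-dependent `raise ValueError`, which would graph-break or error under torch.compile. Its implementation is not shown in this diff; a hedged sketch of what such a helper can look like:

import torch

def torch_compilable_check(condition: bool, message: str) -> None:
    # Hypothetical sketch, not the library's code.
    if torch.compiler.is_compiling():
        torch._check(bool(condition), lambda: message)  # traceable, no graph break
    elif not condition:
        raise ValueError(message)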
@@ -2331,7 +2331,8 @@ class Mask2FormerForUniversalSegmentation(Mask2FormerPreTrainedModel):
         ```python
         >>> from transformers import AutoImageProcessor, Mask2FormerForUniversalSegmentation
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> import torch

         >>> # Load Mask2Former trained on COCO instance segmentation dataset
@@ -2341,7 +2342,8 @@ class Mask2FormerForUniversalSegmentation(Mask2FormerPreTrainedModel):
         ... )

         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> inputs = image_processor(image, return_tensors="pt")

         >>> with torch.no_grad():
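Note: the docstring examples across the repo move from `requests` to `httpx` for fetching sample images. The pattern, standalone (this mirrors the docstring lines above):

from io import BytesIO

import httpx
from PIL import Image

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
with httpx.stream("GET", url) as response:
    image = Image.open(BytesIO(response.read()))
print(image.size)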
@@ -2364,7 +2366,8 @@ class Mask2FormerForUniversalSegmentation(Mask2FormerPreTrainedModel):
         ```python
         >>> from transformers import AutoImageProcessor, Mask2FormerForUniversalSegmentation
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> import torch

         >>> # Load Mask2Former trained on ADE20k semantic segmentation dataset
@@ -2374,7 +2377,8 @@ class Mask2FormerForUniversalSegmentation(Mask2FormerPreTrainedModel):
         >>> url = (
         ...     "https://huggingface.co/datasets/hf-internal-testing/fixtures_ade20k/resolve/main/ADE_val_00000001.jpg"
         ... )
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> inputs = image_processor(image, return_tensors="pt")

         >>> with torch.no_grad():
@@ -2398,7 +2402,8 @@ class Mask2FormerForUniversalSegmentation(Mask2FormerPreTrainedModel):
         ```python
         >>> from transformers import AutoImageProcessor, Mask2FormerForUniversalSegmentation
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> import torch

         >>> # Load Mask2Former trained on CityScapes panoptic segmentation dataset
@@ -2408,7 +2413,8 @@ class Mask2FormerForUniversalSegmentation(Mask2FormerPreTrainedModel):
         ... )

         >>> url = "https://cdn-media.huggingface.co/Inference-API/Sample-results-on-the-Cityscapes-dataset-The-above-images-show-how-our-method-can-handle.png"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> inputs = image_processor(image, return_tensors="pt")

         >>> with torch.no_grad():
@@ -13,12 +13,11 @@
 # limitations under the License.
 """MaskFormer model configuration"""

+from ...backbone_utils import consolidate_backbone_kwargs_to_config
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
-from ...utils.backbone_utils import verify_backbone_config_arguments
 from ..auto import CONFIG_MAPPING, AutoConfig
 from ..detr import DetrConfig
-from ..swin import SwinConfig


 logger = logging.get_logger(__name__)
@@ -49,18 +48,6 @@ class MaskFormerConfig(PreTrainedConfig):
         backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `SwinConfig()`):
             The configuration passed to the backbone, if unset, the configuration corresponding to
             `swin-base-patch4-window12-384` will be used.
-        backbone (`str`, *optional*):
-            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
-            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
-            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
-        use_pretrained_backbone (`bool`, *optional*, `False`):
-            Whether to use pretrained weights for the backbone.
-        use_timm_backbone (`bool`, *optional*, `False`):
-            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
-            library.
-        backbone_kwargs (`dict`, *optional*):
-            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
-            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
         decoder_config (`Dict`, *optional*):
             The configuration passed to the transformer decoder model, if unset the base config for `detr-resnet-50`
             will be used.
@@ -119,37 +106,23 @@ class MaskFormerConfig(PreTrainedConfig):
         cross_entropy_weight: float = 1.0,
         mask_weight: float = 20.0,
         output_auxiliary_logits: bool | None = None,
-        backbone: str | None = None,
-        use_pretrained_backbone: bool = False,
-        use_timm_backbone: bool = False,
-        backbone_kwargs: dict | None = None,
         **kwargs,
     ):
-        if backbone_config is None and backbone is None:
-            # fall back to https://huggingface.co/microsoft/swin-base-patch4-window12-384-in22k
-            backbone_config = SwinConfig(
-                image_size=384,
-                num_channels=3,
-                patch_size=4,
-                embed_dim=128,
-                depths=[2, 2, 18, 2],
-                num_heads=[4, 8, 16, 32],
-                window_size=12,
-                drop_path_rate=0.3,
-                out_features=["stage1", "stage2", "stage3", "stage4"],
-            )
-        elif isinstance(backbone_config, dict):
-            backbone_model_type = backbone_config.pop("model_type")
-            config_class = CONFIG_MAPPING[backbone_model_type]
-            backbone_config = config_class.from_dict(backbone_config)
-
-        verify_backbone_config_arguments(
-            use_timm_backbone=use_timm_backbone,
-            use_pretrained_backbone=use_pretrained_backbone,
-            backbone=backbone,
+        backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
             backbone_config=backbone_config,
-            backbone_kwargs=backbone_kwargs,
+            default_config_type="swin",
+            default_config_kwargs={
+                "depths": [2, 2, 18, 2],
+                "drop_path_rate": 0.3,
+                "image_size": 384,
+                "embed_dim": 128,
+                "num_heads": [4, 8, 16, 32],
+                "window_size": 12,
+                "out_features": ["stage1", "stage2", "stage3", "stage4"],
+            },
+            **kwargs,
         )
+
         # verify that the backbone is supported
         if backbone_config is not None and backbone_config.model_type not in self.backbones_supported:
             logger.warning_once(
@@ -192,10 +165,6 @@ class MaskFormerConfig(PreTrainedConfig):

         self.num_attention_heads = self.decoder_config.encoder_attention_heads
         self.num_hidden_layers = self.decoder_config.num_hidden_layers
-        self.backbone = backbone
-        self.use_pretrained_backbone = use_pretrained_backbone
-        self.use_timm_backbone = use_timm_backbone
-        self.backbone_kwargs = backbone_kwargs
         super().__init__(**kwargs)


@@ -13,9 +13,9 @@
 # limitations under the License.
 """MaskFormer Swin Transformer model configuration"""

+from ...backbone_utils import BackboneConfigMixin
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
-from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices


 logger = logging.get_logger(__name__)
@@ -144,9 +144,7 @@ class MaskFormerSwinConfig(BackboneConfigMixin, PreTrainedConfig):
         # this indicates the channel dimension after the last stage of the model
         self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
-        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
-            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
-        )
+        self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features)


 __all__ = ["MaskFormerSwinConfig"]
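Note: the free-function alignment helper moves onto `BackboneConfigMixin` as a method. A hedged sketch of the contract it enforces, mirroring the old `get_aligned_output_features_output_indices` behavior (the 5.x method body is not shown in this diff):

# If only `out_features` is given, derive `out_indices` from the stage order,
# and vice versa, so the two always stay aligned.
stage_names = ["stem", "stage1", "stage2", "stage3", "stage4"]
out_features = ["stage1", "stage2", "stage3", "stage4"]
out_indices = [stage_names.index(name) for name in out_features]
print(out_indices)  # [1, 2, 3, 4]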
@@ -17,8 +17,8 @@ import math
 from typing import TYPE_CHECKING, Any, Optional, Union

 import torch
+import torchvision.transforms.v2.functional as tvF
 from torch import nn
-from torchvision.transforms.v2 import functional as F

 from transformers.image_transforms import get_size_with_aspect_ratio

@@ -147,7 +147,7 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast):
         image: torch.Tensor,
         size: SizeDict,
         size_divisor: int = 0,
-        interpolation: Optional["F.InterpolationMode"] = None,
+        interpolation: Optional["tvF.InterpolationMode"] = None,
         **kwargs,
     ) -> torch.Tensor:
         """
@@ -172,7 +172,7 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast):
             interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
                 Resampling filter to use if resizing the image.
         """
-        interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
+        interpolation = interpolation if interpolation is not None else tvF.InterpolationMode.BILINEAR
         if size.shortest_edge and size.longest_edge:
             # Resize the image so that the shortest edge or the longest edge is of the given size
             # while maintaining the aspect ratio of the original image.
@@ -196,7 +196,7 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast):
             width = int(math.ceil(width / size_divisor) * size_divisor)
             new_size = (height, width)

-        image = F.resize(
+        image = tvF.resize(
             image,
             size=new_size,
             interpolation=interpolation,
@@ -222,9 +222,9 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast):
         )
         if original_size != padded_size:
             padding = [0, 0, padding_right, padding_bottom]
-            images = F.pad(images, padding, fill=fill)
+            images = tvF.pad(images, padding, fill=fill)
             if segmentation_maps is not None:
-                segmentation_maps = [F.pad(mask, padding, fill=ignore_index) for mask in segmentation_maps]
+                segmentation_maps = [tvF.pad(mask, padding, fill=ignore_index) for mask in segmentation_maps]

         # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
         pixel_mask = torch.zeros((images.shape[0], *padded_size), dtype=torch.int64, device=images.device)
@@ -290,7 +290,7 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast):
         size: SizeDict | None,
         pad_size: SizeDict | None,
         size_divisor: int | None,
-        interpolation: Union["PILImageResampling", "F.InterpolationMode"] | None,
+        interpolation: Union["PILImageResampling", "tvF.InterpolationMode"] | None,
         do_rescale: bool | None,
         rescale_factor: float | None,
         do_normalize: bool | None,
@@ -325,7 +325,7 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast):
                 image=stacked_segmentation_maps,
                 size=size,
                 size_divisor=size_divisor,
-                interpolation=F.InterpolationMode.NEAREST_EXACT,
+                interpolation=tvF.InterpolationMode.NEAREST_EXACT,
             )
             resized_images_grouped[shape] = stacked_images
         if segmentation_maps is not None: