transformers 5.0.0rc3-py3-none-any.whl → 5.1.0-py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -36,7 +36,7 @@ from ...modeling_rope_utils import (
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
-from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
+from ...utils.generic import OutputRecorder, check_model_inputs, is_flash_attention_requested, maybe_autocast
 from .configuration_mllama import MllamaConfig, MllamaTextConfig, MllamaVisionConfig
 
 
@@ -252,10 +252,9 @@ class MllamaVisionAttention(nn.Module):
         key = key.view(batch_size, kv_seq_len, self.num_heads, self.head_dim).transpose(1, 2)
         value = value.view(batch_size, kv_seq_len, self.num_heads, self.head_dim).transpose(1, 2)
 
-        attention_interface: Callable = eager_attention_forward
-
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -451,10 +450,9 @@ class MllamaTextCrossAttention(nn.Module):
                 "Cross attention layer can't find neither `cross_attn_states` nor cached values for key/values!"
             )
 
-        attention_interface: Callable = eager_attention_forward
-
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -554,10 +552,9 @@ class MllamaTextSelfAttention(nn.Module):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
-        attention_interface: Callable = eager_attention_forward
-
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -863,7 +860,7 @@ class MllamaPreTrainedModel(PreTrainedModel):
         past_key_values: Cache,
         output_attentions: bool = False,
     ):
-        if self.config._attn_implementation == "flash_attention_2":
+        if is_flash_attention_requested(self.config):
             if attention_mask is not None and (attention_mask == 0.0).any():
                 return attention_mask
         return None
@@ -1067,7 +1064,8 @@ class MllamaVisionModel(MllamaPreTrainedModel):
 
         ```python
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from transformers import AutoProcessor, MllamaVisionModel
 
         >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
@@ -1075,7 +1073,8 @@ class MllamaVisionModel(MllamaPreTrainedModel):
         >>> processor = AutoProcessor.from_pretrained(checkpoint)
 
         >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> inputs = processor(images=image, return_tensors="pt")
 
         >>> output = model(**inputs)
@@ -1454,7 +1453,6 @@ class MllamaModel(MllamaPreTrainedModel):
         self.hidden_size = config.text_config.hidden_size
         self.max_num_tiles = config.vision_config.max_num_tiles
         self.vision_output_dim = config.vision_config.vision_output_dim
-        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
 
         self.vision_model = MllamaVisionModel._from_config(config.vision_config)
         self.language_model = MllamaTextModel._from_config(config.text_config)
@@ -1657,7 +1655,8 @@ class MllamaForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin):
 
         ```python
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from transformers import AutoProcessor, MllamaForConditionalGeneration
 
         >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
@@ -1666,7 +1665,8 @@ class MllamaForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin):
 
         >>> prompt = "<|image|>If I had to write a haiku for this one"
         >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
 
         >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
 
@@ -17,9 +17,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from ...backbone_utils import consolidate_backbone_kwargs_to_config
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
-from ...utils.backbone_utils import verify_backbone_config_arguments
 from ..auto import CONFIG_MAPPING, AutoConfig
 
 
@@ -39,18 +39,6 @@ class MMGroundingDinoConfig(PreTrainedConfig):
     Args:
         backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `SwinConfig()`):
             The configuration of the backbone model.
-        backbone (`str`, *optional*):
-            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
-            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
-            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
-        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
-            Whether to use pretrained weights for the backbone.
-        use_timm_backbone (`bool`, *optional*, defaults to `False`):
-            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
-            library.
-        backbone_kwargs (`dict`, *optional*):
-            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
-            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
         text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `BertConfig`):
             The config object or dictionary of the text backbone.
         num_queries (`int`, *optional*, defaults to 900):
@@ -127,6 +115,8 @@ class MMGroundingDinoConfig(PreTrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the layer normalization layers.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Examples:
 
@@ -153,10 +143,6 @@ class MMGroundingDinoConfig(PreTrainedConfig):
     def __init__(
         self,
         backbone_config=None,
-        backbone=None,
-        use_pretrained_backbone=False,
-        use_timm_backbone=False,
-        backbone_kwargs=None,
         text_config=None,
         num_queries=900,
         encoder_layers=6,
@@ -194,40 +180,17 @@ class MMGroundingDinoConfig(PreTrainedConfig):
         positional_embedding_temperature=20,
         init_std=0.02,
         layer_norm_eps=1e-5,
+        tie_word_embeddings=True,
         **kwargs,
     ):
-        if backbone_config is None and backbone is None:
-            logger.info("`backbone_config` is `None`. Initializing the config with the default `Swin` backbone.")
-            backbone_config = CONFIG_MAPPING["swin"](
-                window_size=7,
-                image_size=224,
-                embed_dim=96,
-                depths=[2, 2, 6, 2],
-                num_heads=[3, 6, 12, 24],
-                out_indices=[2, 3, 4],
-            )
-        elif isinstance(backbone_config, dict):
-            backbone_model_type = backbone_config.pop("model_type")
-            config_class = CONFIG_MAPPING[backbone_model_type]
-            backbone_config = config_class.from_dict(backbone_config)
-
-        verify_backbone_config_arguments(
-            use_timm_backbone=use_timm_backbone,
-            use_pretrained_backbone=use_pretrained_backbone,
-            backbone=backbone,
+        backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
             backbone_config=backbone_config,
-            backbone_kwargs=backbone_kwargs,
+            default_config_type="swin",
+            default_config_kwargs={"out_indices": [2, 3, 4]},
+            **kwargs,
         )
 
-        if text_config is None:
-            text_config = {}
-            logger.info("text_config is None. Initializing the text config with default values (`BertConfig`).")
-
         self.backbone_config = backbone_config
-        self.backbone = backbone
-        self.use_pretrained_backbone = use_pretrained_backbone
-        self.use_timm_backbone = use_timm_backbone
-        self.backbone_kwargs = backbone_kwargs
         self.num_queries = num_queries
         self.d_model = d_model
         self.encoder_ffn_dim = encoder_ffn_dim
@@ -261,6 +224,7 @@ class MMGroundingDinoConfig(PreTrainedConfig):
             text_config["model_type"] = text_config.get("model_type", "bert")
             text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
         elif text_config is None:
+            logger.info("text_config is None. Initializing the text config with default values (`BertConfig`).")
             text_config = CONFIG_MAPPING["bert"]()
 
         self.text_config = text_config
@@ -277,6 +241,7 @@ class MMGroundingDinoConfig(PreTrainedConfig):
         self.positional_embedding_temperature = positional_embedding_temperature
         self.init_std = init_std
         self.layer_norm_eps = layer_norm_eps
+        self.tie_word_embeddings = tie_word_embeddings
 
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
 
@@ -27,20 +27,16 @@ from torch import Tensor, nn
 
 from ... import initialization as init
 from ...activations import ACT2FN
-from ...file_utils import ModelOutput, is_timm_available, requires_backends
+from ...backbone_utils import load_backbone
+from ...file_utils import ModelOutput
 from ...integrations import use_kernel_forward_from_hub
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import meshgrid
-from ...utils import auto_docstring
-from ...utils.backbone_utils import load_backbone
+from ...utils import auto_docstring, torch_compilable_check
 from ..auto.modeling_auto import AutoModel
 from .configuration_mm_grounding_dino import MMGroundingDinoConfig
 
 
-if is_timm_available():
-    from timm import create_model
-
-
 class MMGroundingDinoContrastiveEmbedding(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -182,9 +178,6 @@ class MMGroundingDinoMultiscaleDeformableAttention(nn.Module):
 
         self.disable_custom_kernels = config.disable_custom_kernels
 
-    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Tensor | None):
-        return tensor if position_embeddings is None else tensor + position_embeddings
-
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -200,15 +193,15 @@ class MMGroundingDinoMultiscaleDeformableAttention(nn.Module):
     ):
         # add position embeddings to the hidden states before projecting to queries and keys
         if position_embeddings is not None:
-            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+            hidden_states = hidden_states + position_embeddings
 
         batch_size, num_queries, _ = hidden_states.shape
         batch_size, sequence_length, _ = encoder_hidden_states.shape
         # Ignore copy
-        if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
-            raise ValueError(
-                "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
-            )
+        torch_compilable_check(
+            (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == sequence_length,
+            "Make sure to align the spatial shapes with the sequence length of the encoder hidden states",
+        )
 
         value = self.value_proj(encoder_hidden_states)
         if attention_mask is not None:
@@ -654,46 +647,23 @@ class MMGroundingDinoConvEncoder(nn.Module):
         super().__init__()
 
         self.config = config
-
-        if config.use_timm_backbone:
-            requires_backends(self, ["timm"])
-            backbone = create_model(
-                config.backbone,
-                pretrained=config.use_pretrained_backbone,
-                features_only=True,
-                **config.backbone_kwargs,
-            )
-        else:
-            backbone = load_backbone(config)
+        backbone = load_backbone(config)
 
         # replace batch norm by frozen batch norm
         with torch.no_grad():
             replace_batch_norm(backbone)
         self.model = backbone
-        self.intermediate_channel_sizes = (
-            self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
-        )
-
-        backbone_model_type = None
-        if config.backbone is not None:
-            backbone_model_type = config.backbone
-        elif config.backbone_config is not None:
-            backbone_model_type = config.backbone_config.model_type
-        else:
-            raise ValueError("Either `backbone` or `backbone_config` should be provided in the config")
+        self.intermediate_channel_sizes = self.model.channels
 
+        backbone_model_type = config.backbone_config.model_type
         if "resnet" in backbone_model_type:
             for name, parameter in self.model.named_parameters():
-                if config.use_timm_backbone:
-                    if "layer2" not in name and "layer3" not in name and "layer4" not in name:
-                        parameter.requires_grad_(False)
-                else:
-                    if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
-                        parameter.requires_grad_(False)
+                if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
+                    parameter.requires_grad_(False)
 
     def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
         # send pixel_values through the model to get list of feature maps
-        features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps
+        features = self.model(pixel_values, return_dict=True).feature_maps
 
         out = []
         for feature_map in features:
@@ -703,6 +673,7 @@ class MMGroundingDinoConvEncoder(nn.Module):
         return out
 
 
+# TODO: use modular - Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->MMGroundingDino
 class MMGroundingDinoConvModel(nn.Module):
     """
     This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
@@ -1131,12 +1102,12 @@ class MMGroundingDinoEncoder(MMGroundingDinoPreTrainedModel):
         self.post_init()
 
     @staticmethod
-    def get_reference_points(spatial_shapes, valid_ratios, device):
+    def get_reference_points(spatial_shapes_list, valid_ratios, device):
         """
         Get reference points for each feature map.
 
         Args:
-            spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
+            spatial_shapes_list (`list[tuple[int, int]]`):
                 Spatial shapes of each feature map.
             valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
                 Valid ratios of each feature map.
@@ -1146,7 +1117,7 @@ class MMGroundingDinoEncoder(MMGroundingDinoPreTrainedModel):
             `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
         """
         reference_points_list = []
-        for level, (height, width) in enumerate(spatial_shapes):
+        for level, (height, width) in enumerate(spatial_shapes_list):
             ref_y, ref_x = meshgrid(
                 torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device),
                 torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device),
@@ -1229,7 +1200,7 @@ class MMGroundingDinoEncoder(MMGroundingDinoPreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=vision_features.device)
+        reference_points = self.get_reference_points(spatial_shapes_list, valid_ratios, device=vision_features.device)
 
         encoder_vision_states = () if output_hidden_states else None
         encoder_text_states = () if output_hidden_states else None
@@ -1783,33 +1754,42 @@ def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor):
         - **attention_mask** (`torch.BoolTensor` of shape `(batch_size, sequence_length, sequence_length)`)
         - **position_ids** (`torch.LongTensor` of shape `(batch_size, sequence_length)`)
     """
-    batch_size, num_token = input_ids.shape
-    # special_tokens_mask: batch_size, num_token. 1 for special tokens. 0 for normal tokens
-    special_tokens_mask = torch.zeros((batch_size, num_token), device=input_ids.device).bool()
-    for special_token in SPECIAL_TOKENS:
-        special_tokens_mask |= input_ids == special_token
-
-    # idxs: each row is a list of indices of special tokens
-    idxs = torch.nonzero(special_tokens_mask)
-
-    # generate attention mask and positional ids
-    attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(batch_size, 1, 1)
-    position_ids = torch.zeros((batch_size, num_token), device=input_ids.device)
-    previous_col = 0
-    for i in range(idxs.shape[0]):
-        row, col = idxs[i]
-        if (col == 0) or (col == num_token - 1):
-            attention_mask[row, col, col] = True
-            position_ids[row, col] = 0
-        else:
-            attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
-            position_ids[row, previous_col + 1 : col + 1] = torch.arange(
-                0, col - previous_col, device=input_ids.device
-            )
+    batch_size, seq_len = input_ids.shape
+    device = input_ids.device
+
+    # Identify special token positions
+    special_mask = torch.isin(input_ids, torch.tensor(SPECIAL_TOKENS, device=device))
+
+    # For each position, find the previous and next special token indices
+    indices = torch.arange(seq_len, device=device).unsqueeze(0).expand(batch_size, -1)
 
-        previous_col = col
+    # Previous special token: cummax of special token indices
+    prev_special = torch.where(special_mask, indices, torch.tensor(-1, device=device))
+    prev_special = torch.cummax(prev_special, dim=1)[0]
 
-    return attention_mask, position_ids
+    # Next special token: flip, cummin, flip back
+    next_special = torch.where(special_mask, indices, torch.tensor(seq_len, device=device))
+    next_special = torch.flip(torch.cummin(torch.flip(next_special, dims=[1]), dim=1)[0], dims=[1])
+
+    # Tokens with the same next_special belong to the same block
+    # Exclude blocks whose closing delimiter is at position 0 or seq_len-1
+    valid_block = (next_special != 0) & (next_special != seq_len - 1) & (next_special != seq_len)
+
+    # Build attention mask: tokens attend to each other if they share the same next_special
+    next_i = next_special.unsqueeze(2)  # (B, N, 1)
+    next_j = next_special.unsqueeze(1)  # (B, 1, N)
+    attention_mask = (next_i == next_j) & valid_block.unsqueeze(1)
+
+    # Always allow self-attention
+    identity = torch.eye(seq_len, device=device, dtype=torch.bool).unsqueeze(0).expand(batch_size, -1, -1)
+    attention_mask = identity | attention_mask
+
+    # Position IDs: distance from previous special token
+    position_ids = indices - prev_special - 1
+    position_ids = torch.where(valid_block, position_ids, torch.zeros_like(position_ids))
+    position_ids = torch.clamp(position_ids, min=0).to(torch.long)
+
+    return attention_mask, position_ids
 
 
 @auto_docstring(
@@ -1888,13 +1868,13 @@ class MMGroundingDinoModel(MMGroundingDinoPreTrainedModel):
         valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1)
         return valid_ratio
 
-    def generate_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
+    def generate_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes_list):
         """Generate the encoder output proposals from encoded enc_output.
 
         Args:
             enc_output (`torch.Tensor[batch_size, sequence_length, hidden_size]`): Output of the encoder.
             padding_mask (`torch.Tensor[batch_size, sequence_length]`): Padding mask for `enc_output`.
-            spatial_shapes (`torch.Tensor[num_feature_levels, 2]`): Spatial shapes of the feature maps.
+            spatial_shapes_list (`list[tuple[int, int]]`): Spatial shapes of each feature map.
 
         Returns:
             `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
@@ -1906,7 +1886,7 @@ class MMGroundingDinoModel(MMGroundingDinoPreTrainedModel):
         batch_size = enc_output.shape[0]
         proposals = []
         current_position = 0
-        for level, (height, width) in enumerate(spatial_shapes):
+        for level, (height, width) in enumerate(spatial_shapes_list):
             mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)]
             mask_flatten_ = mask_flatten_.view(batch_size, height, width, 1)
             valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
@@ -1970,10 +1950,12 @@ class MMGroundingDinoModel(MMGroundingDinoPreTrainedModel):
         ```python
         >>> from transformers import AutoProcessor, AutoModel
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> text = "a cat."
 
         >>> processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
@@ -2121,7 +2103,7 @@ class MMGroundingDinoModel(MMGroundingDinoPreTrainedModel):
         encoder_pred_boxes = None
         if self.config.two_stage:
             object_query_embedding, output_proposals = self.generate_encoder_output_proposals(
-                encoder_outputs[0], ~mask_flatten, spatial_shapes
+                encoder_outputs[0], ~mask_flatten, spatial_shapes_list
             )
 
             # hack implementation as in two-stage Deformable DETR
@@ -2218,8 +2200,6 @@ class MMGroundingDinoMLPPredictionHead(nn.Module):
     Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
     height and width of a bounding box w.r.t. an image.
 
-    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
-
     """
 
     def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
@@ -2454,7 +2434,8 @@ class MMGroundingDinoForObjectDetection(MMGroundingDinoPreTrainedModel):
         Examples:
 
         ```python
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
 
         >>> import torch
         >>> from PIL import Image
@@ -2466,8 +2447,9 @@ class MMGroundingDinoForObjectDetection(MMGroundingDinoPreTrainedModel):
         >>> processor = AutoProcessor.from_pretrained(model_id)
         >>> model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
 
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> # Check for cats and remote controls
         >>> text_labels = [["a cat", "a remote control"]]
 
@@ -17,9 +17,9 @@ import torch
 from torch import nn
 
 from ... import initialization as init
+from ...backbone_utils import consolidate_backbone_kwargs_to_config
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
-from ...utils.backbone_utils import verify_backbone_config_arguments
 from ..auto import CONFIG_MAPPING, AutoConfig
 from ..auto.modeling_auto import AutoModel
 from ..grounding_dino.modeling_grounding_dino import (
@@ -52,18 +52,6 @@ class MMGroundingDinoConfig(PreTrainedConfig):
     Args:
         backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `SwinConfig()`):
             The configuration of the backbone model.
-        backbone (`str`, *optional*):
-            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
-            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
-            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
-        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
-            Whether to use pretrained weights for the backbone.
-        use_timm_backbone (`bool`, *optional*, defaults to `False`):
-            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
-            library.
-        backbone_kwargs (`dict`, *optional*):
-            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
-            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
         text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `BertConfig`):
             The config object or dictionary of the text backbone.
         num_queries (`int`, *optional*, defaults to 900):
@@ -140,6 +128,8 @@ class MMGroundingDinoConfig(PreTrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the layer normalization layers.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Examples:
 
@@ -166,10 +156,6 @@ class MMGroundingDinoConfig(PreTrainedConfig):
     def __init__(
         self,
         backbone_config=None,
-        backbone=None,
-        use_pretrained_backbone=False,
-        use_timm_backbone=False,
-        backbone_kwargs=None,
         text_config=None,
         num_queries=900,
         encoder_layers=6,
@@ -207,40 +193,17 @@ class MMGroundingDinoConfig(PreTrainedConfig):
         positional_embedding_temperature=20,
         init_std=0.02,
         layer_norm_eps=1e-5,
+        tie_word_embeddings=True,
         **kwargs,
     ):
-        if backbone_config is None and backbone is None:
-            logger.info("`backbone_config` is `None`. Initializing the config with the default `Swin` backbone.")
-            backbone_config = CONFIG_MAPPING["swin"](
-                window_size=7,
-                image_size=224,
-                embed_dim=96,
-                depths=[2, 2, 6, 2],
-                num_heads=[3, 6, 12, 24],
-                out_indices=[2, 3, 4],
-            )
-        elif isinstance(backbone_config, dict):
-            backbone_model_type = backbone_config.pop("model_type")
-            config_class = CONFIG_MAPPING[backbone_model_type]
-            backbone_config = config_class.from_dict(backbone_config)
-
-        verify_backbone_config_arguments(
-            use_timm_backbone=use_timm_backbone,
-            use_pretrained_backbone=use_pretrained_backbone,
-            backbone=backbone,
+        backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
             backbone_config=backbone_config,
-            backbone_kwargs=backbone_kwargs,
+            default_config_type="swin",
+            default_config_kwargs={"out_indices": [2, 3, 4]},
+            **kwargs,
         )
 
-        if text_config is None:
-            text_config = {}
-            logger.info("text_config is None. Initializing the text config with default values (`BertConfig`).")
-
         self.backbone_config = backbone_config
-        self.backbone = backbone
-        self.use_pretrained_backbone = use_pretrained_backbone
-        self.use_timm_backbone = use_timm_backbone
-        self.backbone_kwargs = backbone_kwargs
         self.num_queries = num_queries
         self.d_model = d_model
         self.encoder_ffn_dim = encoder_ffn_dim
@@ -274,6 +237,7 @@ class MMGroundingDinoConfig(PreTrainedConfig):
             text_config["model_type"] = text_config.get("model_type", "bert")
             text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
         elif text_config is None:
+            logger.info("text_config is None. Initializing the text config with default values (`BertConfig`).")
             text_config = CONFIG_MAPPING["bert"]()
 
         self.text_config = text_config
@@ -290,6 +254,7 @@ class MMGroundingDinoConfig(PreTrainedConfig):
         self.positional_embedding_temperature = positional_embedding_temperature
         self.init_std = init_std
         self.layer_norm_eps = layer_norm_eps
+        self.tie_word_embeddings = tie_word_embeddings
 
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
 
@@ -124,9 +124,12 @@ class MobileBertConfig(PreTrainedConfig):
         normalization_type="no_norm",
         classifier_activation=True,
         classifier_dropout=None,
+        tie_word_embeddings=True,
         **kwargs,
     ):
-        super().__init__(pad_token_id=pad_token_id, **kwargs)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.tie_word_embeddings = tie_word_embeddings
 
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
@@ -204,9 +204,9 @@ class MobileBertSelfAttention(nn.Module):
         key_layer = self.key(key_tensor).view(*hidden_shape).transpose(1, 2)
         value_layer = self.value(value_tensor).view(*hidden_shape).transpose(1, 2)
 
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,