transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl
This diff compares the contents of two publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -40,7 +40,7 @@ from .image_processing_dpt import DPTImageProcessorKwargs
 if TYPE_CHECKING:
     from ...modeling_outputs import DepthEstimatorOutput
 
-
+import torchvision.transforms.v2.functional as tvF
 
 
 def get_resize_output_image_size(
@@ -105,7 +105,7 @@ class DPTImageProcessorFast(BeitImageProcessorFast):
         self,
         image: "torch.Tensor",
         size: SizeDict,
-        interpolation: Optional["
+        interpolation: Optional["tvF.InterpolationMode"] = None,
         antialias: bool = True,
         ensure_multiple_of: int | None = 1,
         keep_aspect_ratio: bool = False,
@@ -169,7 +169,7 @@ class DPTImageProcessorFast(BeitImageProcessorFast):
         pad_top, pad_bottom = _get_pad(height, size_divisor)
         pad_left, pad_right = _get_pad(width, size_divisor)
         padding = (pad_left, pad_top, pad_right, pad_bottom)
-        return
+        return tvF.pad(image, padding)
 
     def _preprocess(
         self,
@@ -177,7 +177,7 @@ class DPTImageProcessorFast(BeitImageProcessorFast):
         do_reduce_labels: bool,
         do_resize: bool,
         size: SizeDict,
-        interpolation: Optional["
+        interpolation: Optional["tvF.InterpolationMode"],
         do_center_crop: bool,
         crop_size: SizeDict,
         do_rescale: bool,
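The hunks above move the DPT fast image processor onto a module-level `tvF` alias for `torchvision.transforms.v2.functional` and apply padding through `tvF.pad`. The following is a rough, self-contained sketch of that padding pattern under stated assumptions: `pad_to_multiple` is a hypothetical helper for illustration only, not the library's `_get_pad`-based implementation.

```python
# Illustrative sketch only; pad_to_multiple is a made-up helper, not transformers code.
import torch
import torchvision.transforms.v2.functional as tvF


def pad_to_multiple(image: torch.Tensor, size_divisor: int) -> torch.Tensor:
    """Pad a (C, H, W) tensor so height and width become multiples of size_divisor."""
    height, width = image.shape[-2:]
    pad_h = (-height) % size_divisor
    pad_w = (-width) % size_divisor
    # torchvision expects padding as (left, top, right, bottom)
    padding = [pad_w // 2, pad_h // 2, pad_w - pad_w // 2, pad_h - pad_h // 2]
    return tvF.pad(image, padding)


image = torch.rand(3, 190, 250)
padded = pad_to_multiple(image, size_divisor=32)
print(padded.shape)  # torch.Size([3, 192, 256])
```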
@@ -271,7 +271,7 @@ class EdgeTamConfig(PreTrainedConfig):
     ... )
 
     >>> # Initializing a EdgeTamConfig with `"facebook/edgetam.1_hiera_tiny"` style configuration
-    >>> configuration =
+    >>> configuration = EdgeTamConfig()
 
     >>> # Initializing a EdgeTamModel (with random weights) from the `"facebook/edgetam.1_hiera_tiny"` style configuration
     >>> model = EdgeTamModel(configuration)
@@ -32,12 +32,13 @@ from transformers.utils.generic import OutputRecorder
 
 from ... import initialization as init
 from ...activations import ACT2FN
-from ...
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...pytorch_utils import compile_compatible_method_lru_cache
-from ...utils import ModelOutput, auto_docstring
-from ...utils.generic import TransformersKwargs, check_model_inputs
+from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging
+from ...utils.generic import TransformersKwargs, check_model_inputs, is_flash_attention_requested
 from ..auto import AutoModel
 from .configuration_edgetam import (
     EdgeTamConfig,
@@ -47,9 +48,7 @@ from .configuration_edgetam import (
 )
 
 
-
-if True:
-    from ..timm_wrapper.modeling_timm_wrapper import TimmWrapperModel
+logger = logging.get_logger(__name__)
 
 
 class EdgeTamLayerNorm(nn.LayerNorm):
@@ -80,16 +79,10 @@ class EdgeTamLayerNorm(nn.LayerNorm):
 
 @dataclass
 @auto_docstring(custom_intro="Base class for the vision encoder's outputs.")
-class EdgeTamVisionEncoderOutput(
+class EdgeTamVisionEncoderOutput(BaseModelOutputWithPooling):
     r"""
     last_hidden_state (`torch.FloatTensor` of shape `(batch_size, height, width, hidden_size)`):
         Sequence of hidden-states at the output of the last layer of the model.
-    fpn_hidden_states (`tuple(torch.FloatTensor)`):
-        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
-        `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck.
-    fpn_position_encoding (`tuple(torch.FloatTensor)`):
-        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
-        `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`.
     hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
         Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
         one for the output of each stage) of shape `(batch_size, height, width, hidden_size)`. Hidden-states of the
@@ -98,13 +91,16 @@ class EdgeTamVisionEncoderOutput(ModelOutput):
         Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
         sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
         the self-attention heads.
+    fpn_hidden_states (`tuple(torch.FloatTensor)`):
+        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
+        `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck.
+    fpn_position_encoding (`tuple(torch.FloatTensor)`):
+        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
+        `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`.
     """
 
-    last_hidden_state: torch.FloatTensor | None = None
     fpn_hidden_states: torch.FloatTensor | None = None
     fpn_position_encoding: torch.FloatTensor | None = None
-    hidden_states: tuple[torch.FloatTensor, ...] | None = None
-    attentions: tuple[torch.FloatTensor, ...] | None = None
 
 
 def eager_attention_forward(
@@ -167,9 +163,18 @@ class EdgeTamAttention(nn.Module):
         key = self.k_proj(key).view(*new_shape).transpose(1, 2)
         value = self.v_proj(value).view(*new_shape).transpose(1, 2)
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+
+        if is_flash_attention_requested(self.config) and attention_similarity is not None:
+            # Target guided masks are represented as float masks and are incompatible with Flash Attention
+            # Fallback to SDPA for this call only so the rest of the model can still benefit from FA
+            attention_interface = ALL_ATTENTION_FUNCTIONS["sdpa"]
+            logger.warning_once(
+                "Falling back to SDPA for target-guided attention because "
+                "Flash Attention does not support additive bias masks."
+            )
 
         attn_output, attn_weights = attention_interface(
             self,
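The hunk above resolves the attention callable through `ALL_ATTENTION_FUNCTIONS.get_interface(...)` and falls back to SDPA when flash attention is requested but a float `attention_similarity` bias is present. Below is a standalone sketch of that dispatch-and-fallback pattern; the registry and attention functions are toy stand-ins written for this note, not the transformers objects.

```python
# Toy illustration of "look up the attention backend, fall back when it can't take a float bias".
from typing import Callable

import torch
import torch.nn.functional as F


def eager_attention(q, k, v, mask=None):
    scores = q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5
    if mask is not None:
        scores = scores + mask  # additive float bias works for eager and SDPA
    return torch.softmax(scores, dim=-1) @ v


def sdpa_attention(q, k, v, mask=None):
    return F.scaled_dot_product_attention(q, k, v, attn_mask=mask)


ATTENTION_REGISTRY: dict[str, Callable] = {"eager": eager_attention, "sdpa": sdpa_attention}


def pick_attention(implementation: str, additive_bias: torch.Tensor | None) -> Callable:
    fn = ATTENTION_REGISTRY.get(implementation, eager_attention)
    # Flash-attention kernels cannot consume additive float masks, so this call falls back
    # to SDPA whenever such a bias is supplied (mirroring the warning in the hunk above).
    if implementation == "flash_attention_2" and additive_bias is not None:
        fn = ATTENTION_REGISTRY["sdpa"]
    return fn
```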
@@ -191,7 +196,7 @@ class EdgeTamAttention(nn.Module):
         return attn_output, attn_weights
 
 
-class EdgeTamTwoWayAttentionBlock(
+class EdgeTamTwoWayAttentionBlock(GradientCheckpointingLayer):
     def __init__(self, config: EdgeTamMaskDecoderConfig, skip_first_layer_pe: bool = False):
         """
         A transformer block with four layers:
@@ -305,7 +310,7 @@ class EdgeTamPreTrainedModel(PreTrainedModel):
     main_input_name = "pixel_values"
     input_modalities = ("image",)
     _supports_sdpa = True
-
+    _supports_flash_attn = True
     _supports_attention_backend = True
 
     @torch.no_grad()
@@ -425,7 +430,9 @@ class EdgeTamVisionNeck(nn.Module):
 class EdgeTamVisionModel(EdgeTamPreTrainedModel):
     config_class = EdgeTamVisionConfig
     main_input_name = "pixel_values"
-    _can_record_outputs
+    # TODO: TimmWrapper models aren't compatible with _can_record_outputs yet. We specifically set this to
+    # an empty dict to avoid the _can_record_outputs from Sam2VisionModel being inherited here.
+    _can_record_outputs = {}
 
     def __init__(self, config: EdgeTamVisionConfig):
         super().__init__(config)
@@ -448,7 +455,7 @@ class EdgeTamVisionModel(EdgeTamPreTrainedModel):
             raise ValueError("You have to specify pixel_values")
 
         # Forward through backbone
-        backbone_output = self.backbone(pixel_values)
+        backbone_output = self.backbone(pixel_values, **kwargs)
         intermediate_hidden_states = backbone_output.last_hidden_state
         intermediate_hidden_states = [hidden_state.permute(0, 2, 3, 1) for hidden_state in intermediate_hidden_states]
 
@@ -461,6 +468,7 @@ class EdgeTamVisionModel(EdgeTamPreTrainedModel):
             last_hidden_state=intermediate_hidden_states[-1],
             fpn_hidden_states=fpn_hidden_states,
             fpn_position_encoding=fpn_position_encoding,
+            hidden_states=backbone_output.hidden_states,
         )
 
 
@@ -914,6 +922,7 @@ class EdgeTamMaskDecoder(nn.Module):
 class EdgeTamModel(EdgeTamPreTrainedModel):
     input_modalities = ("image", "text")
     _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(EdgeTamTwoWayAttentionBlock, index=2)}
+    _tied_weights_keys = {}
     _keys_to_ignore_on_load_unexpected = [
         r"^memory_.*",
         r"^mask_downsample.*",
@@ -969,7 +978,8 @@ class EdgeTamModel(EdgeTamPreTrainedModel):
                 Input pixel values
         """
         batch_size = pixel_values.shape[0]
-
+        image_outputs = self.get_image_features(pixel_values, return_dict=True, **kwargs)
+        feature_maps = image_outputs.fpn_hidden_states
 
         # add no memory embedding to the last feature map
         feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding
@@ -1088,14 +1098,16 @@ class EdgeTamModel(EdgeTamPreTrainedModel):
 
         ```python
         >>> from PIL import Image
-        >>> import
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from transformers import AutoModel, AutoProcessor
 
         >>> model = AutoModel.from_pretrained("danelcsb/edgetam.1_hiera_tiny")
         >>> processor = AutoProcessor.from_pretrained("danelcsb/edgetam.1_hiera_tiny")
 
-        >>>
-        >>>
+        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
+        >>> with httpx.stream("GET", url) as response:
+        ...     raw_image = Image.open(BytesIO(response.read())).convert("RGB")
         >>> input_points = [[[400, 650]]] # 2D location of a window on the car
         >>> inputs = processor(images=raw_image, input_points=input_points, return_tensors="pt")
 
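The docstring hunk above switches the example to downloading the image with `httpx` streaming. For reference, the same download can be done without streaming; this variant is purely illustrative, reusing the URL from the docstring above.

```python
# Non-streaming equivalent of the httpx image download shown in the updated docstring.
from io import BytesIO

import httpx
from PIL import Image

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
raw_image = Image.open(BytesIO(httpx.get(url, follow_redirects=True).content)).convert("RGB")
print(raw_image.size)
```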
@@ -1125,10 +1137,12 @@ class EdgeTamModel(EdgeTamPreTrainedModel):
         vision_hidden_states = None
 
         if pixel_values is not None:
-
-                pixel_values,
-                **kwargs,
+            image_outputs: EdgeTamVisionEncoderOutput = self.get_image_features(
+                pixel_values, return_dict=True, **kwargs
             )
+            feature_maps = image_outputs.fpn_hidden_states
+            vision_hidden_states = image_outputs.hidden_states
+            vision_attentions = image_outputs.attentions
 
         # add no memory embedding to the last feature map
         feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding
@@ -1188,34 +1202,18 @@ class EdgeTamModel(EdgeTamPreTrainedModel):
             vision_attentions=vision_attentions,
         )
 
+    @can_return_tuple
+    @auto_docstring
     def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple
-        list[torch.Tensor],
-        list[torch.Tensor],
-        tuple[torch.FloatTensor, ...] | None,
-        tuple[torch.FloatTensor, ...] | None,
-    ]:
+    ) -> tuple | EdgeTamVisionEncoderOutput:
         r"""
-
-
-        Args:
-            pixel_values (`torch.FloatTensor`):
-                Input pixel values of shape `(batch_size, num_channels, height, width)`.
-
-        Returns:
-            `tuple`: A tuple containing:
-            - feature_maps (`list[torch.Tensor]`): List of feature maps from different levels.
-            - feature_maps_position_embeddings (`list[torch.Tensor]`): List of positional embeddings for each feature level.
-            - vision_hidden_states (`tuple[torch.FloatTensor]`, *optional*): Hidden states from the vision encoder.
-            - vision_attentions (`tuple[torch.FloatTensor]`, *optional*): Attention weights from the vision encoder.
+        pixel_values (`torch.FloatTensor`):
+            Input pixel values of shape `(batch_size, num_channels, height, width)`.
         """
-        vision_outputs: EdgeTamVisionEncoderOutput = self.vision_encoder(
-            pixel_values,
-            **kwargs,
-        )
+        vision_outputs: EdgeTamVisionEncoderOutput = self.vision_encoder(pixel_values, return_dict=True, **kwargs)
 
         feature_maps = vision_outputs.fpn_hidden_states
         feature_maps_position_embeddings = vision_outputs.fpn_position_encoding
@@ -1232,8 +1230,10 @@ class EdgeTamModel(EdgeTamPreTrainedModel):
             feature_map_position_embedding.flatten(2).permute(2, 0, 1)
             for feature_map_position_embedding in feature_maps_position_embeddings
         ]
+        vision_outputs.fpn_hidden_states = feature_maps
+        vision_outputs.fpn_position_encoding = feature_maps_position_embeddings
 
-        return
+        return vision_outputs
 
 
 __all__ = ["EdgeTamModel", "EdgeTamVisionModel", "EdgeTamPreTrainedModel"]
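With `@can_return_tuple` and the `EdgeTamVisionEncoderOutput` return type, `get_image_features` now hands back a structured output instead of a bare 4-tuple. The sketch below shows how a caller might consume it; the checkpoint id comes from the docstring above, while the dummy input resolution and the exact call pattern are assumptions rather than a tested recipe.

```python
# Hedged usage sketch; normally pixel_values would come from the EdgeTAM processor.
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained("danelcsb/edgetam.1_hiera_tiny")

pixel_values = torch.rand(1, 3, 1024, 1024)  # assumed resolution, for illustration only
vision_outputs = model.get_image_features(pixel_values)      # EdgeTamVisionEncoderOutput
feature_maps = vision_outputs.fpn_hidden_states              # flattened FPN feature maps
position_encodings = vision_outputs.fpn_position_encoding

# Thanks to @can_return_tuple, callers that want the old style can still ask for a tuple.
as_tuple = model.get_image_features(pixel_values, return_dict=False)
```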
@@ -37,11 +37,6 @@ from ..sam2.modeling_sam2 import (
 )
 
 
-# fix this in modular
-if True:
-    from ..timm_wrapper.modeling_timm_wrapper import TimmWrapperModel
-
-
 class EdgeTamVisionConfig(PreTrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`EdgeTamVisionModel`]. It is used to instantiate a SAM
@@ -188,7 +183,9 @@ class EdgeTamPreTrainedModel(Sam2PreTrainedModel):
 class EdgeTamVisionModel(Sam2VisionModel):
     config_class = EdgeTamVisionConfig
     main_input_name = "pixel_values"
-    _can_record_outputs
+    # TODO: TimmWrapper models aren't compatible with _can_record_outputs yet. We specifically set this to
+    # an empty dict to avoid the _can_record_outputs from Sam2VisionModel being inherited here.
+    _can_record_outputs = {}
 
     def get_input_embeddings(self):
         raise NotImplementedError("Can't get input embeddings from timm wrapper model")
@@ -203,7 +200,7 @@ class EdgeTamVisionModel(Sam2VisionModel):
             raise ValueError("You have to specify pixel_values")
 
         # Forward through backbone
-        backbone_output = self.backbone(pixel_values)
+        backbone_output = self.backbone(pixel_values, **kwargs)
         intermediate_hidden_states = backbone_output.last_hidden_state
         intermediate_hidden_states = [hidden_state.permute(0, 2, 3, 1) for hidden_state in intermediate_hidden_states]
 
@@ -216,6 +213,7 @@ class EdgeTamVisionModel(Sam2VisionModel):
             last_hidden_state=intermediate_hidden_states[-1],
             fpn_hidden_states=fpn_hidden_states,
             fpn_position_encoding=fpn_position_encoding,
+            hidden_states=backbone_output.hidden_states,
         )
 
 
@@ -37,12 +37,12 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import BaseModelOutput
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...pytorch_utils import compile_compatible_method_lru_cache
-from ...utils import ModelOutput, auto_docstring
-from ...utils.generic import TransformersKwargs
+from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging
+from ...utils.generic import TransformersKwargs, is_flash_attention_requested
 from ..auto import AutoModel
 from .configuration_edgetam_video import (
     EdgeTamVideoConfig,
@@ -51,6 +51,9 @@ from .configuration_edgetam_video import (
 )
 
 
+logger = logging.get_logger(__name__)
+
+
 class EdgeTamVideoLayerNorm(nn.LayerNorm):
     r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
     The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
@@ -116,16 +119,10 @@ class EdgeTamVideoMemoryFuserCXBlock(GradientCheckpointingLayer):
 
 @dataclass
 @auto_docstring(custom_intro="Base class for the vision encoder's outputs.")
-class EdgeTamVideoVisionEncoderOutput(
+class EdgeTamVideoVisionEncoderOutput(BaseModelOutputWithPooling):
     r"""
     last_hidden_state (`torch.FloatTensor` of shape `(batch_size, height, width, hidden_size)`):
         Sequence of hidden-states at the output of the last layer of the model.
-    fpn_hidden_states (`tuple(torch.FloatTensor)`):
-        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
-        `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck.
-    fpn_position_encoding (`tuple(torch.FloatTensor)`):
-        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
-        `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`.
     hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
         Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
         one for the output of each stage) of shape `(batch_size, height, width, hidden_size)`. Hidden-states of the
@@ -134,13 +131,16 @@ class EdgeTamVideoVisionEncoderOutput(ModelOutput):
         Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
         sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
         the self-attention heads.
+    fpn_hidden_states (`tuple(torch.FloatTensor)`):
+        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
+        `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck.
+    fpn_position_encoding (`tuple(torch.FloatTensor)`):
+        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
+        `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`.
     """
 
-    last_hidden_state: torch.FloatTensor | None = None
     fpn_hidden_states: torch.FloatTensor | None = None
     fpn_position_encoding: torch.FloatTensor | None = None
-    hidden_states: tuple[torch.FloatTensor, ...] | None = None
-    attentions: tuple[torch.FloatTensor, ...] | None = None
 
 
 class EdgeTamVideoVisionRotaryEmbedding(nn.Module):
@@ -245,9 +245,18 @@ class EdgeTamVideoAttention(nn.Module):
         key = self.k_proj(key).view(*new_shape).transpose(1, 2)
         value = self.v_proj(value).view(*new_shape).transpose(1, 2)
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+
+        if is_flash_attention_requested(self.config) and attention_similarity is not None:
+            # Target guided masks are represented as float masks and are incompatible with Flash Attention
+            # Fallback to SDPA for this call only so the rest of the model can still benefit from FA
+            attention_interface = ALL_ATTENTION_FUNCTIONS["sdpa"]
+            logger.warning_once(
+                "Falling back to SDPA for target-guided attention because "
+                "Flash Attention does not support additive bias masks."
+            )
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -355,9 +364,9 @@ class EdgeTamVideoRoPESelfAttention(nn.Module):
         # Apply rotary position encoding for self-attention
         query, key = apply_rotary_pos_emb_2d_self_attn(query, key, cos=cos, sin=sin)
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -506,9 +515,9 @@ class EdgeTamVideoRoPECrossAttention(nn.Module):
             num_k_exclude_rope=num_k_exclude_rope,
         )
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -528,7 +537,7 @@ class EdgeTamVideoRoPECrossAttention(nn.Module):
         return attn_output, attn_weights
 
 
-class EdgeTamVideoTwoWayAttentionBlock(
+class EdgeTamVideoTwoWayAttentionBlock(GradientCheckpointingLayer):
     def __init__(self, config: EdgeTamVideoMaskDecoderConfig, skip_first_layer_pe: bool = False):
         """
         A transformer block with four layers:
@@ -807,7 +816,7 @@ class EdgeTamVideoPreTrainedModel(PreTrainedModel):
     main_input_name = "pixel_values"
     input_modalities = "video"
     _supports_sdpa = True
-
+    _supports_flash_attn = True
     _supports_attention_backend = True
 
     @torch.no_grad()
@@ -1322,9 +1331,9 @@ class EdgeTamVideoPerceiverAttention(nn.Module):
         value = value + pos_encoding
 
         # Apply attention
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, _ = attention_interface(
             self,
@@ -1991,6 +2000,7 @@ def get_1d_sine_pe(pos_inds, dim, temperature=10000):
 class EdgeTamVideoModel(EdgeTamVideoPreTrainedModel):
     input_modalities = ("video", "text")
     _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(EdgeTamVideoTwoWayAttentionBlock, index=2)}
+    _tied_weights_keys = {}
     _keys_to_ignore_on_load_unexpected = []
 
     def __init__(self, config: EdgeTamVideoConfig):
@@ -2074,7 +2084,8 @@ class EdgeTamVideoModel(EdgeTamVideoPreTrainedModel):
                 Input pixel values
         """
         batch_size = pixel_values.shape[0]
-
+        image_outputs = self.get_image_features(pixel_values, return_dict=True, **kwargs)
+        feature_maps = image_outputs.fpn_hidden_states
 
         # add no memory embedding to the last feature map
         feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding
@@ -2219,34 +2230,18 @@ class EdgeTamVideoModel(EdgeTamVideoPreTrainedModel):
             frame_idx=frame_idx,
         )
 
+    @can_return_tuple
+    @auto_docstring
     def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple
-        list[torch.Tensor],
-        list[torch.Tensor],
-        tuple[torch.FloatTensor, ...] | None,
-        tuple[torch.FloatTensor, ...] | None,
-    ]:
+    ) -> tuple | EdgeTamVideoVisionEncoderOutput:
         r"""
-
-
-        Args:
-            pixel_values (`torch.FloatTensor`):
-                Input pixel values of shape `(batch_size, num_channels, height, width)`.
-
-        Returns:
-            `tuple`: A tuple containing:
-            - feature_maps (`list[torch.Tensor]`): List of feature maps from different levels.
-            - feature_maps_position_embeddings (`list[torch.Tensor]`): List of positional embeddings for each feature level.
-            - vision_hidden_states (`tuple[torch.FloatTensor]`, *optional*): Hidden states from the vision encoder.
-            - vision_attentions (`tuple[torch.FloatTensor]`, *optional*): Attention weights from the vision encoder.
+        pixel_values (`torch.FloatTensor`):
+            Input pixel values of shape `(batch_size, num_channels, height, width)`.
         """
-        vision_outputs: EdgeTamVideoVisionEncoderOutput = self.vision_encoder(
-            pixel_values,
-            **kwargs,
-        )
+        vision_outputs: EdgeTamVideoVisionEncoderOutput = self.vision_encoder(pixel_values, return_dict=True, **kwargs)
 
         feature_maps = vision_outputs.fpn_hidden_states
         feature_maps_position_embeddings = vision_outputs.fpn_position_encoding
@@ -2263,8 +2258,10 @@ class EdgeTamVideoModel(EdgeTamVideoPreTrainedModel):
             feature_map_position_embedding.flatten(2).permute(2, 0, 1)
             for feature_map_position_embedding in feature_maps_position_embeddings
         ]
+        vision_outputs.fpn_hidden_states = feature_maps
+        vision_outputs.fpn_position_encoding = feature_maps_position_embeddings
 
-        return
+        return vision_outputs
 
     def _prepare_vision_features(
         self,
@@ -2281,7 +2278,9 @@ class EdgeTamVideoModel(EdgeTamVideoPreTrainedModel):
         else:
             # Compute features using image encoder
             image_batch = inference_session.get_frame(frame_idx).unsqueeze(0) # Add batch dimension
-
+            image_outputs = self.get_image_features(image_batch, return_dict=True)
+            vision_feats = image_outputs.fpn_hidden_states
+            vision_pos_embeds = image_outputs.fpn_position_encoding
             # Cache features
             inference_session.cache.cache_vision_features(
                 frame_idx, {"vision_feats": vision_feats, "vision_pos_embeds": vision_pos_embeds}
@@ -2386,10 +2385,10 @@ class EdgeTamVideoModel(EdgeTamVideoPreTrainedModel):
         vision_hidden_states = None
 
         if pixel_values is not None:
-
-
-
-
+            image_outputs = self.get_image_features(pixel_values, return_dict=True, **kwargs)
+            feature_maps = image_outputs.fpn_hidden_states
+            vision_hidden_states = image_outputs.hidden_states
+            vision_attentions = image_outputs.attentions
 
         # add no memory embedding to the last feature map
         feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding
@@ -536,9 +536,9 @@ class EdgeTamVideoRoPESelfAttention(nn.Module):
         # Apply rotary position encoding for self-attention
         query, key = apply_rotary_pos_emb_2d_self_attn(query, key, cos=cos, sin=sin)
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -612,9 +612,9 @@ class EdgeTamVideoRoPECrossAttention(nn.Module):
             num_k_exclude_rope=num_k_exclude_rope,
         )
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -854,9 +854,9 @@ class EdgeTamVideoPerceiverAttention(nn.Module):
         value = value + pos_encoding
 
         # Apply attention
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, _ = attention_interface(
             self,
@@ -7,8 +7,8 @@
 from typing import Optional
 
 import torch
+import torchvision.transforms.v2.functional as tvF
 from PIL import Image, ImageDraw
-from torchvision.transforms.v2 import functional as F
 
 from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature
 from ...image_transforms import group_images_by_shape, reorder_images
@@ -84,7 +84,7 @@ def convert_to_grayscale(
     """
     if is_grayscale(image):
        return image
-    return
+    return tvF.rgb_to_grayscale(image, num_output_channels=3)
 
 
 @auto_docstring
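`convert_to_grayscale` now delegates to torchvision's v2 functional API while keeping three output channels. A minimal illustration of that call, independent of the EfficientLoFTR processor:

```python
# Three-channel grayscale conversion via the tvF alias used in the hunk above.
import torch
import torchvision.transforms.v2.functional as tvF

image = torch.rand(3, 480, 640)
gray = tvF.rgb_to_grayscale(image, num_output_channels=3)
print(gray.shape)  # torch.Size([3, 480, 640]); all three channels are identical
```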
@@ -111,6 +111,7 @@ class EfficientLoFTRImageProcessorFast(BaseImageProcessorFast):
         **kwargs,
     ) -> ImageInput:
         # we need to handle image pairs validation and flattening
+        images = self.fetch_images(images)
         return flatten_pair_images(images)
 
     def _preprocess(
@@ -120,7 +121,7 @@ class EfficientLoFTRImageProcessorFast(BaseImageProcessorFast):
         rescale_factor: float,
         do_rescale: bool,
         do_resize: bool,
-        interpolation: Optional["
+        interpolation: Optional["tvF.InterpolationMode"],
         do_grayscale: bool,
         disable_grouping: bool,
         return_tensors: str | TensorType,
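The `Optional["tvF.InterpolationMode"]` annotations introduced throughout these fast image processors refer to the interpolation enum exposed by `torchvision.transforms.v2.functional`. A minimal, illustrative resize call with that alias (not the EfficientLoFTR code itself):

```python
# How an InterpolationMode value is typically passed to the v2 functional resize.
import torch
import torchvision.transforms.v2.functional as tvF

image = torch.rand(3, 480, 640)
resized = tvF.resize(
    image,
    size=[256, 256],
    interpolation=tvF.InterpolationMode.BILINEAR,
    antialias=True,
)
print(resized.shape)  # torch.Size([3, 256, 256])
```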