transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in the supported public registries. It is provided for informational purposes only.
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -27,8 +27,8 @@ import pathlib
 from typing import TYPE_CHECKING, Any, Optional
 
 import torch
+import torchvision.transforms.v2.functional as tvF
 from torchvision.io import read_image
-from torchvision.transforms.v2 import functional as F
 
 from ...image_processing_utils import BatchFeature, get_size_dict
 from ...image_processing_utils_fast import (
@@ -354,7 +354,7 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
         self,
         image: torch.Tensor,
         size: SizeDict,
-        interpolation: Optional["F.InterpolationMode"] = None,
+        interpolation: Optional["tvF.InterpolationMode"] = None,
         **kwargs,
     ) -> torch.Tensor:
         """
@@ -377,7 +377,7 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
             interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
                 Resampling filter to use if resizing the image.
         """
-        interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
+        interpolation = interpolation if interpolation is not None else tvF.InterpolationMode.BILINEAR
         if size.shortest_edge and size.longest_edge:
             # Resize the image so that the shortest edge or the longest edge is of the given size
             # while maintaining the aspect ratio of the original image.
@@ -396,7 +396,7 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
                 f" {size.keys()}."
             )
 
-        image = F.resize(
+        image = tvF.resize(
             image,
             size=new_size,
             interpolation=interpolation,
@@ -410,7 +410,7 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
         orig_size: tuple[int, int],
         target_size: tuple[int, int],
         threshold: float = 0.5,
-        interpolation: Optional["F.InterpolationMode"] = None,
+        interpolation: Optional["tvF.InterpolationMode"] = None,
     ):
         """
         Resizes an annotation to a target size.
@@ -424,10 +424,10 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
                 The target size of the image, as returned by the preprocessing `resize` step.
             threshold (`float`, *optional*, defaults to 0.5):
                 The threshold used to binarize the segmentation masks.
-            resample (`InterpolationMode`, defaults to `F.InterpolationMode.NEAREST_EXACT`):
+            resample (`InterpolationMode`, defaults to `tvF.InterpolationMode.NEAREST_EXACT`):
                 The resampling filter to use when resizing the masks.
         """
-        interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST_EXACT
+        interpolation = interpolation if interpolation is not None else tvF.InterpolationMode.NEAREST_EXACT
         ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]
 
         new_annotation = {}
@@ -446,7 +446,7 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
                 new_annotation["area"] = scaled_area
             elif key == "masks":
                 masks = value[:, None]
-                masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks]
+                masks = [tvF.resize(mask, target_size, interpolation=interpolation) for mask in masks]
                 masks = torch.stack(masks).to(torch.float32)
                 masks = masks[:, 0] > threshold
                 new_annotation["masks"] = masks
@@ -490,7 +490,7 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
         for key, value in annotation.items():
             if key == "masks":
                 masks = value
-                masks = F.pad(
+                masks = tvF.pad(
                     masks,
                     padding,
                     fill=0,
@@ -525,7 +525,7 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
         )
         if original_size != padded_size:
             padding = [0, 0, padding_right, padding_bottom]
-            image = F.pad(image, padding, fill=fill)
+            image = tvF.pad(image, padding, fill=fill)
             if annotation is not None:
                 annotation = self._update_annotation_for_padded_image(
                     annotation, original_size, padded_size, padding, update_bboxes
@@ -545,7 +545,7 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
         return_segmentation_masks: bool,
         do_resize: bool,
         size: SizeDict,
-        interpolation: Optional["F.InterpolationMode"],
+        interpolation: Optional["tvF.InterpolationMode"],
         do_rescale: bool,
         rescale_factor: float,
         do_normalize: bool,
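The hunks above only swap the alias under which the torchvision v2 functional API is imported (`tvF` instead of the old module alias), so the resize and pad call sites keep the same behavior. A minimal standalone sketch of the calls the processor relies on, assuming a torchvision build that ships the `transforms.v2` namespace:

```python
import torch
import torchvision.transforms.v2.functional as tvF

# A dummy CHW uint8 image standing in for a decoded picture
image = torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8)

# Resize with an explicit interpolation mode, as the processor's resize path now does via tvF
resized = tvF.resize(image, size=[256, 256], interpolation=tvF.InterpolationMode.BILINEAR)

# Pad only on the right/bottom; torchvision padding order is (left, top, right, bottom)
padded = tvF.pad(resized, [0, 0, 16, 16], fill=0)

print(resized.shape, padded.shape)  # torch.Size([3, 256, 256]) torch.Size([3, 272, 272])
```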
@@ -23,20 +23,16 @@ from torch import Tensor, nn
 
 from ... import initialization as init
 from ...activations import ACT2FN
-from ...
+from ...backbone_utils import load_backbone
+from ...file_utils import ModelOutput
 from ...integrations import use_kernel_forward_from_hub
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import meshgrid
-from ...utils import auto_docstring, logging
-from ...utils.backbone_utils import load_backbone
+from ...utils import auto_docstring, logging, torch_compilable_check
 from ..auto import AutoModel
 from .configuration_grounding_dino import GroundingDinoConfig
 
 
-if is_timm_available():
-    from timm import create_model
-
-
 logger = logging.get_logger(__name__)
 
 
@@ -373,47 +369,23 @@ class GroundingDinoConvEncoder(nn.Module):
         super().__init__()
 
         self.config = config
-
-        if config.use_timm_backbone:
-            requires_backends(self, ["timm"])
-            backbone = create_model(
-                config.backbone,
-                pretrained=config.use_pretrained_backbone,
-                features_only=True,
-                **config.backbone_kwargs,
-            )
-        else:
-            backbone = load_backbone(config)
+        backbone = load_backbone(config)
 
         # replace batch norm by frozen batch norm
         with torch.no_grad():
             replace_batch_norm(backbone)
         self.model = backbone
-        self.intermediate_channel_sizes = (
-            self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
-        )
-
-        backbone_model_type = None
-        if config.backbone is not None:
-            backbone_model_type = config.backbone
-        elif config.backbone_config is not None:
-            backbone_model_type = config.backbone_config.model_type
-        else:
-            raise ValueError("Either `backbone` or `backbone_config` should be provided in the config")
+        self.intermediate_channel_sizes = self.model.channels
 
+        backbone_model_type = config.backbone_config.model_type
         if "resnet" in backbone_model_type:
             for name, parameter in self.model.named_parameters():
-                if config.use_timm_backbone:
-                    if "layer2" not in name and "layer3" not in name and "layer4" not in name:
-                        parameter.requires_grad_(False)
-                else:
-                    if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
-                        parameter.requires_grad_(False)
+                if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
+                    parameter.requires_grad_(False)
 
-    # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder.forward with Detr->GroundingDino
     def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
         # send pixel_values through the model to get list of feature maps
-        features = self.model(pixel_values
+        features = self.model(pixel_values, return_dict=True).feature_maps
 
         out = []
         for feature_map in features:
@@ -423,7 +395,7 @@ class GroundingDinoConvEncoder(nn.Module):
         return out
 
 
-# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->GroundingDino
+# TODO: use modular - Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->GroundingDino
 class GroundingDinoConvModel(nn.Module):
     """
     This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
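With the timm branch gone, the conv encoder always goes through `load_backbone` and reads channel sizes from the returned backbone's `channels` attribute. A hedged sketch of what that unified path looks like for a caller, assuming the new top-level `transformers.backbone_utils` module and a config that carries a `backbone_config` (the default `GroundingDinoConfig` ships a Swin one):

```python
import torch
from transformers import GroundingDinoConfig
from transformers.backbone_utils import load_backbone

config = GroundingDinoConfig()  # config-only; weights are randomly initialized
backbone = load_backbone(config)
print(type(backbone).__name__, backbone.channels)

# Feature maps are now always taken from the BackboneOutput, never from timm's list output
with torch.no_grad():
    features = backbone(torch.randn(1, 3, 224, 224), return_dict=True).feature_maps
print([tuple(f.shape) for f in features])
```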
@@ -549,9 +521,6 @@ class GroundingDinoMultiscaleDeformableAttention(nn.Module):
 
         self.disable_custom_kernels = config.disable_custom_kernels
 
-    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Tensor | None):
-        return tensor if position_embeddings is None else tensor + position_embeddings
-
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -567,15 +536,15 @@ class GroundingDinoMultiscaleDeformableAttention(nn.Module):
     ):
         # add position embeddings to the hidden states before projecting to queries and keys
         if position_embeddings is not None:
-            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+            hidden_states = hidden_states + position_embeddings
 
         batch_size, num_queries, _ = hidden_states.shape
         batch_size, sequence_length, _ = encoder_hidden_states.shape
         # Ignore copy
-
-
-
-
+        torch_compilable_check(
+            (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == sequence_length,
+            "Make sure to align the spatial shapes with the sequence length of the encoder hidden states",
+        )
 
         value = self.value_proj(encoder_hidden_states)
         if attention_mask is not None:
@@ -1461,12 +1430,12 @@ class GroundingDinoEncoder(GroundingDinoPreTrainedModel):
         self.post_init()
 
     @staticmethod
-    def get_reference_points(
+    def get_reference_points(spatial_shapes_list, valid_ratios, device):
         """
         Get reference points for each feature map.
 
         Args:
-
+            spatial_shapes_list (`list[tuple[int, int]]`):
                 Spatial shapes of each feature map.
             valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
                 Valid ratios of each feature map.
@@ -1476,7 +1445,7 @@ class GroundingDinoEncoder(GroundingDinoPreTrainedModel):
             `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
         """
         reference_points_list = []
-        for level, (height, width) in enumerate(
+        for level, (height, width) in enumerate(spatial_shapes_list):
             ref_y, ref_x = meshgrid(
                 torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device),
                 torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device),
@@ -1559,7 +1528,7 @@ class GroundingDinoEncoder(GroundingDinoPreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        reference_points = self.get_reference_points(
+        reference_points = self.get_reference_points(spatial_shapes_list, valid_ratios, device=vision_features.device)
 
         encoder_vision_states = () if output_hidden_states else None
         encoder_text_states = () if output_hidden_states else None
@@ -1867,33 +1836,42 @@ def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTen
         - **attention_mask** (`torch.BoolTensor` of shape `(batch_size, sequence_length, sequence_length)`)
         - **position_ids** (`torch.LongTensor` of shape `(batch_size, sequence_length)`)
     """
-    batch_size,
-
-
-
-
-
-    #
-
-
-    # generate attention mask and positional ids
-    attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(batch_size, 1, 1)
-    position_ids = torch.zeros((batch_size, num_token), device=input_ids.device)
-    previous_col = 0
-    for i in range(idxs.shape[0]):
-        row, col = idxs[i]
-        if (col == 0) or (col == num_token - 1):
-            attention_mask[row, col, col] = True
-            position_ids[row, col] = 0
-        else:
-            attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
-            position_ids[row, previous_col + 1 : col + 1] = torch.arange(
-                0, col - previous_col, device=input_ids.device
-            )
+    batch_size, seq_len = input_ids.shape
+    device = input_ids.device
+
+    # Identify special token positions
+    special_mask = torch.isin(input_ids, torch.tensor(SPECIAL_TOKENS, device=device))
+
+    # For each position, find the previous and next special token indices
+    indices = torch.arange(seq_len, device=device).unsqueeze(0).expand(batch_size, -1)
 
-
+    # Previous special token: cummax of special token indices
+    prev_special = torch.where(special_mask, indices, torch.tensor(-1, device=device))
+    prev_special = torch.cummax(prev_special, dim=1)[0]
 
-
+    # Next special token: flip, cummin, flip back
+    next_special = torch.where(special_mask, indices, torch.tensor(seq_len, device=device))
+    next_special = torch.flip(torch.cummin(torch.flip(next_special, dims=[1]), dim=1)[0], dims=[1])
+
+    # Tokens with the same next_special belong to the same block
+    # Exclude blocks whose closing delimiter is at position 0 or seq_len-1
+    valid_block = (next_special != 0) & (next_special != seq_len - 1) & (next_special != seq_len)
+
+    # Build attention mask: tokens attend to each other if they share the same next_special
+    next_i = next_special.unsqueeze(2)  # (B, N, 1)
+    next_j = next_special.unsqueeze(1)  # (B, 1, N)
+    attention_mask = (next_i == next_j) & valid_block.unsqueeze(1)
+
+    # Always allow self-attention
+    identity = torch.eye(seq_len, device=device, dtype=torch.bool).unsqueeze(0).expand(batch_size, -1, -1)
+    attention_mask = identity | attention_mask
+
+    # Position IDs: distance from previous special token
+    position_ids = indices - prev_special - 1
+    position_ids = torch.where(valid_block, position_ids, torch.zeros_like(position_ids))
+    position_ids = torch.clamp(position_ids, min=0).to(torch.long)
+
+    return attention_mask, position_ids
 
 
 @auto_docstring(
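The core of the rewritten helper is the cummax/cummin trick that finds, for every position, the nearest special token on either side without a Python loop. A standalone sketch of just that pattern on a toy sequence, with token id 0 standing in for the special/delimiter ids:

```python
import torch

input_ids = torch.tensor([[0, 5, 6, 0, 7, 8, 9, 0]])  # 0 marks delimiter positions
batch_size, seq_len = input_ids.shape
special_mask = input_ids == 0
indices = torch.arange(seq_len).unsqueeze(0).expand(batch_size, -1)

# Index of the most recent delimiter at or before each position (-1 if none yet)
prev_special = torch.cummax(torch.where(special_mask, indices, torch.tensor(-1)), dim=1)[0]

# Index of the next delimiter at or after each position (seq_len if none remains)
next_special = torch.flip(
    torch.cummin(torch.flip(torch.where(special_mask, indices, torch.tensor(seq_len)), dims=[1]), dim=1)[0],
    dims=[1],
)

print(prev_special)  # tensor([[0, 0, 0, 3, 3, 3, 3, 7]])
print(next_special)  # tensor([[0, 3, 3, 3, 7, 7, 7, 7]])
```

Everything else in the new implementation (block membership, the identity mask, the position ids) is derived from these two tensors with elementwise ops, which is what makes the function batch-friendly and loop-free.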
@@ -1993,13 +1971,13 @@ class GroundingDinoModel(GroundingDinoPreTrainedModel):
         valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1)
         return valid_ratio
 
-    def generate_encoder_output_proposals(self, enc_output, padding_mask,
+    def generate_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes_list):
         """Generate the encoder output proposals from encoded enc_output.
 
         Args:
             enc_output (`torch.Tensor[batch_size, sequence_length, hidden_size]`): Output of the encoder.
             padding_mask (`torch.Tensor[batch_size, sequence_length]`): Padding mask for `enc_output`.
-
+            spatial_shapes_list (`list[tuple[int, int]]`): Spatial shapes of each feature map.
 
         Returns:
             `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
@@ -2011,7 +1989,7 @@ class GroundingDinoModel(GroundingDinoPreTrainedModel):
         batch_size = enc_output.shape[0]
         proposals = []
         current_position = 0
-        for level, (height, width) in enumerate(
+        for level, (height, width) in enumerate(spatial_shapes_list):
             mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)]
             mask_flatten_ = mask_flatten_.view(batch_size, height, width, 1)
             valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
@@ -2075,10 +2053,12 @@ class GroundingDinoModel(GroundingDinoPreTrainedModel):
         ```python
         >>> from transformers import AutoProcessor, AutoModel
         >>> from PIL import Image
-        >>> import
+        >>> import httpx
+        >>> from io import BytesIO
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>>
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> text = "a cat."
 
         >>> processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
@@ -2226,7 +2206,7 @@ class GroundingDinoModel(GroundingDinoPreTrainedModel):
         encoder_pred_boxes = None
         if self.config.two_stage:
             object_query_embedding, output_proposals = self.generate_encoder_output_proposals(
-                encoder_outputs[0], ~mask_flatten,
+                encoder_outputs[0], ~mask_flatten, spatial_shapes_list
             )
 
             # hack implementation as in two-stage Deformable DETR
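The docstring edits in this and the following files all replace the previous example image download with the same `httpx` streaming idiom; shown standalone below (network access plus the `httpx` and `Pillow` packages are assumed):

```python
from io import BytesIO

import httpx
from PIL import Image

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
with httpx.stream("GET", url) as response:
    image = Image.open(BytesIO(response.read()))

print(image.size)  # e.g. (640, 480)
```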
@@ -2324,8 +2304,6 @@ class GroundingDinoMLPPredictionHead(nn.Module):
     Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
     height and width of a bounding box w.r.t. an image.
 
-    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
-
     """
 
     def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
@@ -2426,6 +2404,8 @@ class GroundingDinoForObjectDetection(GroundingDinoPreTrainedModel):
 
         self.model = GroundingDinoModel(config)
         if not config.decoder_bbox_embed_share:
+            # Convert to instance attribute before modifying
+            self._tied_weights_keys = self._tied_weights_keys.copy()
             del self._tied_weights_keys[r"bbox_embed.(?![0])\d+"]
 
         self.bbox_embed = nn.ModuleList(
@@ -2483,7 +2463,8 @@ class GroundingDinoForObjectDetection(GroundingDinoPreTrainedModel):
         Examples:
 
         ```python
-        >>> import
+        >>> import httpx
+        >>> from io import BytesIO
 
         >>> import torch
         >>> from PIL import Image
@@ -2495,8 +2476,9 @@ class GroundingDinoForObjectDetection(GroundingDinoPreTrainedModel):
         >>> processor = AutoProcessor.from_pretrained(model_id)
         >>> model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
 
-        >>>
-        >>>
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
         >>> # Check for cats and remote controls
         >>> text_labels = [["a cat", "a remote control"]]
 
@@ -96,7 +96,10 @@ class GroupViTTextConfig(PreTrainedConfig):
         eos_token_id=49407,
         **kwargs,
     ):
-        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
 
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
@@ -27,7 +27,8 @@ from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepa
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from ...modeling_utils import PreTrainedModel
-from ...
+from ...processing_utils import Unpack
+from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int
 from .configuration_groupvit import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
 
 
@@ -1157,14 +1158,16 @@ class GroupViTVisionModel(GroupViTPreTrainedModel):
 
         ```python
         >>> from PIL import Image
-        >>> import
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from transformers import AutoProcessor, GroupViTVisionModel
 
         >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
         >>> model = GroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>>
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
 
         >>> inputs = processor(images=image, return_tensors="pt")
 
@@ -1227,19 +1230,16 @@ class GroupViTModel(GroupViTPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
-    @
+    @can_return_tuple
     @auto_docstring
     def get_text_features(
         self,
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
-
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
         r"""
-        Returns:
-            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
-            applying the projection layer to the pooled output of [`GroupViTTextModel`].
-
         Examples:
 
         ```python
@@ -1257,18 +1257,22 @@ class GroupViTModel(GroupViTPreTrainedModel):
             input_ids=input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
+            return_dict=True,
+            **kwargs,
         )
-
-
+        pooled_output = text_outputs.pooler_output
+        text_outputs.pooler_output = self.text_projection(pooled_output)
 
-
+        return text_outputs
+
+    @can_return_tuple
     @auto_docstring
-    def get_image_features(
+    def get_image_features(
+        self,
+        pixel_values: torch.Tensor,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithPooling:
         r"""
-        Returns:
-            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
-            applying the projection layer to the pooled output of [`GroupViTVisionModel`].
-
         Examples:
 
         ```python
@@ -1287,9 +1291,10 @@ class GroupViTModel(GroupViTPreTrainedModel):
         >>> with torch.inference_mode():
         ...     image_features = model.get_image_features(**inputs)
         ```"""
-        vision_outputs: BaseModelOutputWithPooling = self.vision_model(pixel_values)
-
-
+        vision_outputs: BaseModelOutputWithPooling = self.vision_model(pixel_values, return_dict=True, **kwargs)
+        vision_outputs.pooler_output = self.visual_projection(vision_outputs.pooler_output)
+
+        return vision_outputs
 
     @auto_docstring
     def forward(
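As the hunks above show, `get_text_features` and `get_image_features` now run the sub-model with `return_dict=True` and return the full `BaseModelOutputWithPooling`, with the projection applied in place to `pooler_output`, rather than a bare feature tensor. A hedged usage sketch (downloading the public `nvidia/groupvit-gcc-yfcc` checkpoint is assumed):

```python
import torch
from transformers import AutoProcessor, GroupViTModel

model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

inputs = processor(text=["a photo of a cat"], return_tensors="pt", padding=True)
with torch.inference_mode():
    text_outputs = model.get_text_features(**inputs)

# The projected text embedding now lives on the returned ModelOutput
print(type(text_outputs).__name__, text_outputs.pooler_output.shape)
```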
@@ -1315,14 +1320,16 @@ class GroupViTModel(GroupViTPreTrainedModel):
 
         ```python
         >>> from PIL import Image
-        >>> import
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from transformers import AutoProcessor, GroupViTModel
 
         >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
         >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>>
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
 
         >>> inputs = processor(
         ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
@@ -148,13 +148,11 @@ class HeliumConfig(PreTrainedConfig):
         self.mlp_bias = mlp_bias
         self.rope_parameters = rope_parameters
 
-        super().__init__(
-            tie_word_embeddings=tie_word_embeddings,
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            **kwargs,
-        )
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        super().__init__(**kwargs)
 
 
 __all__ = ["HeliumConfig"]
@@ -267,9 +267,9 @@ class HeliumAttention(nn.Module):
         cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
         key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -434,7 +434,7 @@ class HeliumModel(HeliumPreTrainedModel):
 @auto_docstring
 class HeliumForCausalLM(HeliumPreTrainedModel, GenerationMixin):
     _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
-    _tp_plan = {"lm_head": "colwise_rep"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
 
     def __init__(self, config):
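`get_interface` collapses the previous assign-then-override pattern into a single lookup with an explicit fallback. Its exact semantics live in `transformers.modeling_utils`; the sketch below only mirrors the dispatch idea with a plain dictionary, so every name in it is illustrative rather than the library's API:

```python
from typing import Callable

def eager_attention_forward(*args, **kwargs):
    """Stand-in for the reference eager implementation."""
    return "eager"

# Illustrative registry; the real one maps implementation names to attention kernels
ATTENTION_FUNCTIONS: dict[str, Callable] = {
    "sdpa": lambda *args, **kwargs: "sdpa",
    "flash_attention_2": lambda *args, **kwargs: "flash_attention_2",
}

def get_interface(attn_implementation: str, default: Callable) -> Callable:
    # Unknown (or "eager") implementations fall back to the default callable
    return ATTENTION_FUNCTIONS.get(attn_implementation, default)

assert get_interface("eager", eager_attention_forward) is eager_attention_forward
assert get_interface("sdpa", eager_attention_forward)() == "sdpa"
```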
@@ -19,8 +19,8 @@
 # limitations under the License.
 
 
+from ...backbone_utils import BackboneConfigMixin
 from ...configuration_utils import PreTrainedConfig
-from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
 
 
 # TODO: Modular conversion for resnet must be fixed as
@@ -120,9 +120,7 @@ class HGNetV2Config(BackboneConfigMixin, PreTrainedConfig):
         self.hidden_sizes = hidden_sizes
         self.hidden_act = hidden_act
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
-        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
-            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
-        )
+        self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features)
         self.stem_channels = stem_channels
         self.stage_in_channels = stage_in_channels
         self.stage_mid_channels = stage_mid_channels
@@ -25,10 +25,10 @@ from torch import Tensor, nn
 
 from ... import initialization as init
 from ...activations import ACT2FN
+from ...backbone_utils import BackboneMixin
 from ...modeling_outputs import BackboneOutput, BaseModelOutputWithNoAttention, ImageClassifierOutputWithNoAttention
 from ...modeling_utils import PreTrainedModel
 from ...utils import auto_docstring
-from ...utils.backbone_utils import BackboneMixin
 from .configuration_hgnet_v2 import HGNetV2Config
 
 
@@ -338,12 +338,11 @@ class HGNetV2Encoder(nn.Module):
         )
 
 
-class HGNetV2Backbone(
+class HGNetV2Backbone(BackboneMixin, HGNetV2PreTrainedModel):
     has_attentions = False
 
     def __init__(self, config: HGNetV2Config):
         super().__init__(config)
-        super()._init_backbone(config)
         self.depths = config.depths
         self.num_features = [config.embedding_size] + config.hidden_sizes
         self.embedder = HGNetV2Embeddings(config)
@@ -448,12 +447,14 @@ class HGNetV2ForImageClassification(HGNetV2PreTrainedModel):
         Examples:
         ```python
         >>> import torch
-        >>> import
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from transformers import HGNetV2ForImageClassification, AutoImageProcessor
         >>> from PIL import Image
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>>
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
 
         >>> model = HGNetV2ForImageClassification.from_pretrained("ustc-community/hgnet-v2")
         >>> processor = AutoImageProcessor.from_pretrained("ustc-community/hgnet-v2")
|