transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
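For readers who want to reproduce a per-file summary like the listing below, here is a minimal sketch that diffs the Python sources inside two wheels. It assumes both wheels have already been fetched locally (e.g. with `pip download transformers==5.0.0rc3 --no-deps`); the filenames and the `wheel_texts` helper are illustrative, not part of any registry tooling, and the counts may differ slightly from the listing depending on diff settings.

```python
# Sketch: approximate a "+added -removed" per-file listing from two local wheels.
import difflib
import zipfile

OLD = "transformers-5.0.0rc3-py3-none-any.whl"  # assumed local path
NEW = "transformers-5.1.0-py3-none-any.whl"     # assumed local path


def wheel_texts(path: str) -> dict[str, list[str]]:
    """Map each .py member of a wheel to its decoded source lines."""
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode("utf-8", errors="replace").splitlines()
            for name in zf.namelist()
            if name.endswith(".py")
        }


old, new = wheel_texts(OLD), wheel_texts(NEW)
for name in sorted(old.keys() | new.keys()):
    diff = difflib.unified_diff(old.get(name, []), new.get(name, []), lineterm="")
    lines = list(diff)
    # Count hunk lines only, skipping the "+++"/"---" file headers.
    added = sum(1 for l in lines if l.startswith("+") and not l.startswith("+++"))
    removed = sum(1 for l in lines if l.startswith("-") and not l.startswith("---"))
    if added or removed:
        print(f"- {name} +{added} -{removed}")
```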
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -270,18 +270,15 @@ class DiaConfig(PreTrainedConfig):
         self.delay_pattern = delay_pattern if delay_pattern is not None else [0, 8, 9, 10, 11, 12, 13, 14, 15]
         self.initializer_range = initializer_range
         self.use_cache = use_cache
+        self.decoder_config.pad_token_id = pad_token_id
+        self.decoder_config.eos_token_id = eos_token_id
+        self.decoder_config.bos_token_id = bos_token_id

         assert self.decoder_config.num_channels == len(self.delay_pattern), (
             "Number of channels must match delay pattern length."
         )

-        super().__init__(
-            pad_token_id=pad_token_id,
-            eos_token_id=eos_token_id,
-            bos_token_id=bos_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            **kwargs,
-        )
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)

     def get_text_config(self, *args, **kwargs):
         """Defaulting to audio config as it's the decoder in this case which is usually the text backbone"""
@@ -69,7 +69,7 @@ class DiaGenerationMixin(GenerationMixin):
         custom_processors.append(
             DiaEOSChannelFilterLogitsProcessor(
                 num_channels=len(self.config.delay_pattern),
-                eos_token_id=self.config.eos_token_id,
+                eos_token_id=self.config.decoder_config.eos_token_id,
             )
         )

@@ -96,7 +96,7 @@ class DiaGenerationMixin(GenerationMixin):
         merged_processors.append(
             DiaEOSDelayPatternLogitsProcessor(
                 delay_pattern=self.config.delay_pattern,
-                eos_token_id=self.config.eos_token_id,
+                eos_token_id=self.config.decoder_config.eos_token_id,
                 max_generation_len=generation_config.max_length,
                 device=device,
             )
@@ -188,7 +188,8 @@ class DiaGenerationMixin(GenerationMixin):
         # 2. Determine the valid input and what works as mask within the input
         delay_mask = decoder_input_ids.long()
         valid_input_size = (
-            decoder_input_ids.shape[1] - (decoder_input_ids[:, :, 0] == self.config.pad_token_id).sum(dim=-1).max()
+            decoder_input_ids.shape[1]
+            - (decoder_input_ids[:, :, 0] == self.config.decoder_config.pad_token_id).sum(dim=-1).max()
         )
         decoder_input_ids = delay_mask[:, :valid_input_size].transpose(1, 2).long()
         decoder_attention_mask = decoder_attention_mask[:, :valid_input_size].long()
@@ -216,7 +217,7 @@ class DiaGenerationMixin(GenerationMixin):
         # Post processing for CFG and overwriting via delay pattern mask
         # 1. Delay pattern mask -- force tokens if not allowed to predict (!= pad_token in mask)
         model_inputs["decoder_input_ids"] = self.apply_delay_mask(
-            input_ids, self.config.pad_token_id, decoder_delay_mask
+            input_ids, self.config.decoder_config.pad_token_id, decoder_delay_mask
        )

         # Depending on cache usage we need to pass all or just one
@@ -387,26 +388,16 @@ class DiaGenerationMixin(GenerationMixin):
         # Prepare inner 2D logic in generation loop
         input_ids = input_ids.reshape(-1, input_ids.shape[-1])

-
-        # prepare model inputs
-        model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
-
-        # 10. Prefill
-        model_inputs.update({"output_attentions": generation_config.output_attentions})
-        model_inputs.update({"output_hidden_states": generation_config.output_hidden_states})
-        outputs = self(**model_inputs, return_dict=True)
-
-        # 11. expand input_ids with `num_return_sequences` additional sequences per batch
+        # 10. expand input_ids with `num_return_sequences` additional sequences per batch
         if generation_config.num_return_sequences > 1:
             raise ValueError("`num_return_sequences>1` is incompatible with Dia.")

-        # 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
+        # 11. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
         return self._sample(
             input_ids,
             logits_processor=prepared_logits_processor,
             stopping_criteria=prepared_stopping_criteria,
             generation_config=generation_config,
-            prefill_outputs=outputs,
             **generation_mode_kwargs,
             **model_kwargs,
         )
@@ -460,7 +451,7 @@ class DiaGenerationMixin(GenerationMixin):
         output_sequences = output_sequences.reshape(bsz, num_channels, -1).transpose(1, 2)

         # Apply delay mask
-        output_sequences = self.apply_delay_mask(output_sequences, self.config.pad_token_id, delay_mask)
+        output_sequences = self.apply_delay_mask(output_sequences, self.config.decoder_config.pad_token_id, delay_mask)

         if return_dict_in_generate:
             output.sequences = output_sequences
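Across these Dia hunks the special token ids move from the top-level `DiaConfig` onto its `decoder_config`. A minimal sketch of the downstream adjustment, assuming a default-constructible `DiaConfig` (the attribute relocation itself is exactly what the hunks above show):

```python
from transformers import DiaConfig

config = DiaConfig()

# transformers 5.0.0rc3: ids lived on the top-level config, e.g. config.eos_token_id
# transformers 5.1.0: read them from the decoder sub-config instead
pad_id = config.decoder_config.pad_token_id
bos_id = config.decoder_config.bos_token_id
eos_id = config.decoder_config.eos_token_id
```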
@@ -317,9 +317,9 @@ class DiaSelfAttention(nn.Module):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
@@ -392,9 +392,9 @@ class DiaCrossAttention(nn.Module):
         # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
         past_key_values.is_updated[self.layer_idx] = True

-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
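This `-3/+3` change recurs in every attention class below: the old two-step dispatch (default to the eager path, then index `ALL_ATTENTION_FUNCTIONS` for anything else) becomes a single `get_interface` lookup. The removed lines are reconstructed above from the standard transformers idiom; the new call shape is taken verbatim from the hunks. A minimal sketch, with the fallback semantics inferred from the diff:

```python
from typing import Callable

from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS


def resolve_attention(config, eager_attention_forward: Callable) -> Callable:
    # 5.0.0rc3 equivalent:
    #     fn = eager_attention_forward
    #     if config._attn_implementation != "eager":
    #         fn = ALL_ATTENTION_FUNCTIONS[config._attn_implementation]
    # 5.1.0: one call, with the eager function passed as the default interface.
    return ALL_ATTENTION_FUNCTIONS.get_interface(config._attn_implementation, eager_attention_forward)
```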
@@ -776,7 +776,7 @@ class DiaModel(DiaPreTrainedModel):
         bsz, seq_len, channels = (encoder_outputs[0].shape[0], -1, self.config.decoder_config.num_channels)
         if decoder_input_ids is None:
             decoder_input_ids = torch.full(
-                size=(bsz, 1, channels), fill_value=self.config.bos_token_id, device=self.device
+                size=(bsz, 1, channels), fill_value=self.config.decoder_config.bos_token_id, device=self.device
             )
         # Ensure 3D
         if decoder_input_ids.ndim == 2:
@@ -182,9 +182,9 @@ class DiaCrossAttention(nn.Module):
         # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
         past_key_values.is_updated[self.layer_idx] = True

-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
@@ -566,7 +566,7 @@ class DiaModel(DiaPreTrainedModel):
         bsz, seq_len, channels = (encoder_outputs[0].shape[0], -1, self.config.decoder_config.num_channels)
         if decoder_input_ids is None:
             decoder_input_ids = torch.full(
-                size=(bsz, 1, channels), fill_value=self.config.bos_token_id, device=self.device
+                size=(bsz, 1, channels), fill_value=self.config.decoder_config.bos_token_id, device=self.device
             )
         # Ensure 3D
         if decoder_input_ids.ndim == 2:
@@ -144,13 +144,11 @@ class DiffLlamaConfig(PreTrainedConfig):
         self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
         self.rope_parameters = rope_parameters

-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        super().__init__(**kwargs)


 __all__ = ["DiffLlamaConfig"]
@@ -351,12 +351,7 @@ class DiffLlamaFlashAttention2(DiffLlamaAttention):
         device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
         if input_dtype == torch.float32:
             if torch.is_autocast_enabled():
-
-                target_dtype = (
-                    torch.get_autocast_dtype(device_type)
-                    if hasattr(torch, "get_autocast_dtype")
-                    else torch.get_autocast_gpu_dtype()
-                )
+                target_dtype = torch.get_autocast_dtype(device_type)
             # Handle the case where the model is quantized
             elif hasattr(self.config, "_is_quantized"):
                 target_dtype = self.config.dtype
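The flash-attention hunks drop the `hasattr` guard around `torch.get_autocast_dtype`, so 5.1.0 assumes a torch release recent enough to ship the per-device API (the deprecated `torch.get_autocast_gpu_dtype()` fallback is gone). A quick illustration of the call that now runs unconditionally; the torch-version requirement is an inference from the removed guard:

```python
import torch

# Query the autocast dtype for a device-type string; inside an autocast
# region this returns the dtype the region was entered with.
with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    print(torch.get_autocast_dtype("cpu"))  # torch.bfloat16
```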
@@ -469,7 +464,7 @@ class DiffLlamaSdpaAttention(DiffLlamaAttention):

         # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
         # Reference: https://github.com/pytorch/pytorch/issues/112577.
-        if query_states.device.type == "cuda" and causal_mask is not None:
+        if query_states.device.type in ["cuda", "xpu"] and causal_mask is not None:
             query_states = query_states.contiguous()
             key_states = key_states.contiguous()
             value_states = value_states.contiguous()
@@ -689,7 +684,7 @@ class DiffLlamaModel(DiffLlamaPreTrainedModel):
 @auto_docstring
 class DiffLlamaForCausalLM(DiffLlamaPreTrainedModel, GenerationMixin):
     _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
-    _tp_plan = {"lm_head": "colwise_rep"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

     def __init__(self, config):
@@ -227,12 +227,7 @@ class DiffLlamaFlashAttention2(DiffLlamaAttention):
         device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
         if input_dtype == torch.float32:
             if torch.is_autocast_enabled():
-
-                target_dtype = (
-                    torch.get_autocast_dtype(device_type)
-                    if hasattr(torch, "get_autocast_dtype")
-                    else torch.get_autocast_gpu_dtype()
-                )
+                target_dtype = torch.get_autocast_dtype(device_type)
             # Handle the case where the model is quantized
             elif hasattr(self.config, "_is_quantized"):
                 target_dtype = self.config.dtype
@@ -345,7 +340,7 @@ class DiffLlamaSdpaAttention(DiffLlamaAttention):

         # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
         # Reference: https://github.com/pytorch/pytorch/issues/112577.
-        if query_states.device.type == "cuda" and causal_mask is not None:
+        if query_states.device.type in ["cuda", "xpu"] and causal_mask is not None:
             query_states = query_states.contiguous()
             key_states = key_states.contiguous()
             value_states = value_states.contiguous()
@@ -13,9 +13,9 @@
 # limitations under the License.
 """Dilated Neighborhood Attention Transformer model configuration"""

+from ...backbone_utils import BackboneConfigMixin
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
-from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices


 logger = logging.get_logger(__name__)
@@ -143,9 +143,7 @@ class DinatConfig(BackboneConfigMixin, PreTrainedConfig):
         self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
         self.layer_scale_init_value = layer_scale_init_value
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
-        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
-            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
-        )
+        self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features)


 __all__ = ["DinatConfig"]
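This pair of hunks shows the pattern repeated for every backbone below: `BackboneConfigMixin` and `BackboneMixin` now live in the new top-level `transformers.backbone_utils` module (added in this release, per the file list), and the module-level `get_aligned_output_features_output_indices` helper is folded into a `set_output_features_output_indices` method on the config mixin. A toy config following the new pattern; the class name and stage layout here are illustrative, not part of the library:

```python
from transformers.backbone_utils import BackboneConfigMixin
from transformers.configuration_utils import PreTrainedConfig


class ToyBackboneConfig(BackboneConfigMixin, PreTrainedConfig):
    def __init__(self, depths=(2, 2, 6, 2), out_features=None, out_indices=None, **kwargs):
        super().__init__(**kwargs)
        # Stage names must exist before the indices/features are aligned.
        self.stage_names = ["stem"] + [f"stage{i}" for i in range(1, len(depths) + 1)]
        # Replaces get_aligned_output_features_output_indices(...) from
        # transformers.utils.backbone_utils in 5.0.0rc3.
        self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features)
```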
@@ -20,6 +20,7 @@ import torch
 from torch import nn

 from ...activations import ACT2FN
+from ...backbone_utils import BackboneMixin
 from ...modeling_outputs import BackboneOutput
 from ...modeling_utils import PreTrainedModel
 from ...utils import (
@@ -30,7 +31,6 @@ from ...utils import (
     logging,
     requires_backends,
 )
-from ...utils.backbone_utils import BackboneMixin
 from .configuration_dinat import DinatConfig


@@ -710,10 +710,9 @@ class DinatForImageClassification(DinatPreTrainedModel):
     NAT backbone, to be used with frameworks like DETR and MaskFormer.
     """
 )
-class DinatBackbone(DinatPreTrainedModel, BackboneMixin):
+class DinatBackbone(BackboneMixin, DinatPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        super()._init_backbone(config)

         requires_backends(self, ["natten"])

@@ -723,7 +722,7 @@ class DinatBackbone(DinatPreTrainedModel, BackboneMixin):

         # Add layer norms to hidden states of out_features
         hidden_states_norms = {}
-        for stage, num_channels in zip(self._out_features, self.channels):
+        for stage, num_channels in zip(self.out_features, self.channels):
             hidden_states_norms[stage] = nn.LayerNorm(num_channels)
         self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)

@@ -749,10 +748,12 @@ class DinatBackbone(DinatPreTrainedModel, BackboneMixin):
         >>> from transformers import AutoImageProcessor, AutoBackbone
         >>> import torch
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO

         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))

         >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
         >>> model = AutoBackbone.from_pretrained(
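The docstring edits here (and in the DINOv2/DINOv3 examples below) swap `requests` for `httpx` when fetching the sample image. As a standalone script, assuming network access and that `httpx` and `Pillow` are installed:

```python
from io import BytesIO

import httpx
from PIL import Image

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Stream the response and hand the buffered bytes to Pillow, as in the
# updated docstrings above.
with httpx.stream("GET", url) as response:
    image = Image.open(BytesIO(response.read()))
print(image.size)
```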
@@ -13,9 +13,9 @@
 # limitations under the License.
 """DINOv2 model configuration"""

+from ...backbone_utils import BackboneConfigMixin
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
-from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices


 logger = logging.get_logger(__name__)
@@ -145,9 +145,7 @@ class Dinov2Config(BackboneConfigMixin, PreTrainedConfig):
         self.drop_path_rate = drop_path_rate
         self.use_swiglu_ffn = use_swiglu_ffn
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)]
-        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
-            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
-        )
+        self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features)
         self.apply_layernorm = apply_layernorm
         self.reshape_hidden_states = reshape_hidden_states
         self.use_mask_token = use_mask_token
@@ -21,12 +21,12 @@ from torch import nn

 from ... import initialization as init
 from ...activations import ACT2FN
+from ...backbone_utils import BackboneMixin
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, logging, torch_int
-from ...utils.backbone_utils import BackboneMixin
 from ...utils.generic import can_return_tuple, check_model_inputs
 from .configuration_dinov2 import Dinov2Config

@@ -208,9 +208,9 @@ class Dinov2SelfAttention(nn.Module):
         value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
         query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)

-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         context_layer, attention_probs = attention_interface(
             self,
@@ -544,10 +544,9 @@ class Dinov2ForImageClassification(Dinov2PreTrainedModel):
     Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
     """
 )
-class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin):
+class Dinov2Backbone(BackboneMixin, Dinov2PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        super()._init_backbone(config)

         self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
         self.embeddings = Dinov2Embeddings(config)
@@ -573,10 +572,12 @@ class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin):
         >>> from transformers import AutoImageProcessor, AutoBackbone
         >>> import torch
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO

         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))

         >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
         >>> model = AutoBackbone.from_pretrained(
@@ -20,8 +20,8 @@
 # limitations under the License.


+from ...backbone_utils import BackboneConfigMixin
 from ...configuration_utils import PreTrainedConfig
-from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices


 class Dinov2WithRegistersConfig(BackboneConfigMixin, PreTrainedConfig):
@@ -149,9 +149,7 @@ class Dinov2WithRegistersConfig(BackboneConfigMixin, PreTrainedConfig):
         self.use_swiglu_ffn = use_swiglu_ffn
         self.num_register_tokens = num_register_tokens
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)]
-        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
-            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
-        )
+        self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features)
         self.apply_layernorm = apply_layernorm
         self.reshape_hidden_states = reshape_hidden_states

@@ -28,12 +28,12 @@ from torch import nn

 from ... import initialization as init
 from ...activations import ACT2FN
+from ...backbone_utils import BackboneMixin
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, torch_int
-from ...utils.backbone_utils import BackboneMixin
 from ...utils.generic import can_return_tuple, check_model_inputs
 from .configuration_dinov2_with_registers import Dinov2WithRegistersConfig

@@ -228,9 +228,9 @@ class Dinov2WithRegistersSelfAttention(nn.Module):
         value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
         query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)

-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         context_layer, attention_probs = attention_interface(
             self,
@@ -564,10 +564,9 @@ class Dinov2WithRegistersForImageClassification(Dinov2WithRegistersPreTrainedModel):
     Dinov2WithRegisters backbone, to be used with frameworks like DETR and MaskFormer.
     """
 )
-class Dinov2WithRegistersBackbone(Dinov2WithRegistersPreTrainedModel, BackboneMixin):
+class Dinov2WithRegistersBackbone(BackboneMixin, Dinov2WithRegistersPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        super()._init_backbone(config)
         self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
         self.embeddings = Dinov2WithRegistersEmbeddings(config)
         self.encoder = Dinov2WithRegistersEncoder(config)
@@ -597,10 +596,12 @@ class Dinov2WithRegistersBackbone(Dinov2WithRegistersPreTrainedModel, BackboneMixin):
         >>> from transformers import AutoImageProcessor, AutoBackbone
         >>> import torch
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO

         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))

         >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base")
         >>> model = AutoBackbone.from_pretrained(
@@ -26,11 +26,11 @@ from ....transformers.models.dinov2.modeling_dinov2 import (
     Dinov2PreTrainedModel,
 )
 from ... import initialization as init
+from ...backbone_utils import BackboneConfigMixin
 from ...configuration_utils import PreTrainedConfig
 from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, logging, torch_int
-from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices


 logger = logging.get_logger(__name__)
@@ -161,9 +161,7 @@ class Dinov2WithRegistersConfig(BackboneConfigMixin, PreTrainedConfig):
         self.use_swiglu_ffn = use_swiglu_ffn
         self.num_register_tokens = num_register_tokens
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)]
-        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
-            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
-        )
+        self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features)
         self.apply_layernorm = apply_layernorm
         self.reshape_hidden_states = reshape_hidden_states

@@ -338,7 +336,6 @@ class Dinov2WithRegistersForImageClassification(Dinov2ForImageClassification):
 class Dinov2WithRegistersBackbone(Dinov2Backbone):
     def __init__(self, config):
         super().__init__(config)
-        super()._init_backbone(config)

         self.num_register_tokens = config.num_register_tokens
         self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
@@ -366,10 +363,12 @@ class Dinov2WithRegistersBackbone(Dinov2Backbone):
         >>> from transformers import AutoImageProcessor, AutoBackbone
         >>> import torch
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO

         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))

         >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base")
         >>> model = AutoBackbone.from_pretrained(
@@ -13,9 +13,9 @@
 # limitations under the License.
 """ConvNeXT model configuration"""

+from ...backbone_utils import BackboneConfigMixin
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
-from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices


 logger = logging.get_logger(__name__)
@@ -105,9 +105,7 @@ class DINOv3ConvNextConfig(BackboneConfigMixin, PreTrainedConfig):
         self.drop_path_rate = drop_path_rate
         self.image_size = image_size
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)]
-        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
-            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
-        )
+        self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features)

     @property
     def num_stages(self) -> int:
@@ -19,10 +19,10 @@ from torch import nn

 from ... import initialization as init
 from ...activations import ACT2FN
+from ...backbone_utils import BackboneMixin
 from ...modeling_outputs import BackboneOutput, BaseModelOutputWithPoolingAndNoAttention
 from ...modeling_utils import PreTrainedModel
 from ...utils import auto_docstring, logging
-from ...utils.backbone_utils import BackboneMixin
 from ...utils.generic import can_return_tuple
 from .configuration_dinov3_convnext import DINOv3ConvNextConfig

@@ -244,12 +244,11 @@ class DINOv3ConvNextModel(DINOv3ConvNextPreTrainedModel):


 @auto_docstring
-class DINOv3ConvNextBackbone(DINOv3ConvNextPreTrainedModel, BackboneMixin):
+class DINOv3ConvNextBackbone(BackboneMixin, DINOv3ConvNextPreTrainedModel):
     config: DINOv3ConvNextConfig

     def __init__(self, config: DINOv3ConvNextConfig):
         super().__init__(config)
-        super()._init_backbone(config)

         self.num_features = [config.num_channels] + list(config.hidden_sizes)

@@ -13,9 +13,9 @@
 # limitations under the License.
 """DINOv3 model configuration"""

+from ...backbone_utils import BackboneConfigMixin
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
-from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices


 logger = logging.get_logger(__name__)
@@ -182,9 +182,7 @@ class DINOv3ViTConfig(BackboneConfigMixin, PreTrainedConfig):
         self.stage_names = stage_names

         # Initialize backbone features/indices
-        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
-            out_features=out_features, out_indices=out_indices, stage_names=stage_names
-        )
+        self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features)


 __all__ = ["DINOv3ViTConfig"]
@@ -16,7 +16,7 @@
 from typing import Optional

 import torch
-from torchvision.transforms.v2 import functional as F
+import torchvision.transforms.v2.functional as tvF

 from transformers.image_processing_base import BatchFeature
 from transformers.image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
@@ -50,7 +50,7 @@ class DINOv3ViTImageProcessorFast(BaseImageProcessorFast):
         images: list["torch.Tensor"],
         do_resize: bool,
         size: SizeDict,
-        interpolation: Optional["F.InterpolationMode"],
+        interpolation: Optional["tvF.InterpolationMode"],
         do_center_crop: bool,
         crop_size: SizeDict,
         do_rescale: bool,
@@ -27,13 +27,13 @@ from torch import nn

 from ... import initialization as init
 from ...activations import ACT2FN
+from ...backbone_utils import BackboneMixin
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BackboneOutput, BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...pytorch_utils import compile_compatible_method_lru_cache
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.backbone_utils import BackboneMixin
 from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_dinov3_vit import DINOv3ViTConfig

@@ -295,9 +295,9 @@ class DINOv3ViTAttention(nn.Module):
         cos, sin = position_embeddings
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
@@ -516,10 +516,9 @@ class DINOv3ViTModel(DINOv3ViTPreTrainedModel):


 @auto_docstring
-class DINOv3ViTBackbone(DINOv3ViTPreTrainedModel, BackboneMixin):
+class DINOv3ViTBackbone(BackboneMixin, DINOv3ViTPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        super()._init_backbone(config)

         self.embeddings = DINOv3ViTEmbeddings(config)
         self.rope_embeddings = DINOv3ViTRopePositionEmbedding(config)