transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl
This diff reflects the changes between publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -19,13 +19,9 @@
 # limitations under the License.
 import math
 
+from ...backbone_utils import BackboneConfigMixin, consolidate_backbone_kwargs_to_config
 from ...configuration_utils import PreTrainedConfig
-from
-from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
-from ..auto import CONFIG_MAPPING, AutoConfig
-
-
-logger = logging.get_logger(__name__)
+from ..auto import AutoConfig
 
 
 class LwDetrViTConfig(BackboneConfigMixin, PreTrainedConfig):
@@ -33,7 +29,7 @@ class LwDetrViTConfig(BackboneConfigMixin, PreTrainedConfig):
     This is the configuration class to store the configuration of a [`LwDetrViTModel`]. It is used to instantiate an
     LW-DETR ViT model according to the specified arguments, defining the model architecture. Instantiating a configuration
     with the defaults will yield a similar configuration to that of the LW-DETR ViT
-    [
+    [AnnaZhang/lwdetr_small_60e_coco](https://huggingface.co/AnnaZhang/lwdetr_small_60e_coco) architecture.
 
     LW-DETR ViT is the Vision Transformer backbone used in the LW-DETR model for real-time object detection. It features
     interleaved window and global attention mechanisms to reduce computational complexity while maintaining high performance.
@@ -149,9 +145,7 @@ class LwDetrViTConfig(BackboneConfigMixin, PreTrainedConfig):
         self.use_absolute_position_embeddings = use_absolute_position_embeddings
 
         self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, self.num_hidden_layers + 1)]
-        self.
-            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
-        )
+        self.set_output_features_output_indices(out_indices=out_indices, out_features=out_features)
 
         self.cae_init_values = cae_init_values
         if num_windows % math.sqrt(num_windows) != 0:
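The three-line call removed above (its head is truncated in this listing) aligned `out_features` and `out_indices` against `stage_names`; the replacement folds that into a single `set_output_features_output_indices` call on the mixin. A minimal standalone sketch of the alignment being performed, assuming list-style index semantics; the `align` helper below is hypothetical, not the library's:

```python
# Hypothetical sketch of out_features/out_indices alignment, not the
# transformers helper itself.
stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, 11)]

def align(out_indices, stage_names):
    # Negative indices wrap around, mirroring Python list indexing.
    indices = [idx % len(stage_names) for idx in out_indices]
    return [stage_names[i] for i in indices], indices

out_features, out_indices = align([2, 4, 5, 9], stage_names)
print(out_features)  # ['stage2', 'stage4', 'stage5', 'stage9']
```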
@@ -171,7 +165,7 @@ class LwDetrConfig(PreTrainedConfig):
     This is the configuration class to store the configuration of a [`LwDetrModel`]. It is used to instantiate
     a LW-DETR model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the LW-DETR
-    [
+    [AnnaZhang/lwdetr_small_60e_coco](https://huggingface.co/AnnaZhang/lwdetr_small_60e_coco) architecture.
 
     LW-DETR (Lightweight Detection Transformer) is a transformer-based object detection model designed for real-time
     detection tasks. It replaces traditional CNN-based detectors like YOLO with a more efficient transformer architecture
@@ -253,10 +247,10 @@ class LwDetrConfig(PreTrainedConfig):
    ```python
     >>> from transformers import LwDetrConfig, LwDetrModel
 
-    >>> # Initializing a LW-DETR
+    >>> # Initializing a LW-DETR AnnaZhang/lwdetr_small_60e_coco style configuration
     >>> configuration = LwDetrConfig()
 
-    >>> # Initializing a model (with random weights) from the
+    >>> # Initializing a model (with random weights) from the AnnaZhang/lwdetr_small_60e_coco style configuration
     >>> model = LwDetrModel(configuration)
 
     >>> # Accessing the model configuration
@@ -308,24 +302,18 @@ class LwDetrConfig(PreTrainedConfig):
     ):
         self.batch_norm_eps = batch_norm_eps
 
-
-
-
-
-
-
-
-
-
-
-
-
-                **kwargs,
-            )
-        elif isinstance(backbone_config, dict):
-            backbone_model_type = backbone_config.pop("model_type")
-            config_class = CONFIG_MAPPING[backbone_model_type]
-            backbone_config = config_class.from_dict(backbone_config)
+        backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
+            backbone_config=backbone_config,
+            default_config_type="lw_detr_vit",
+            default_config_kwargs={
+                "image_size": 1024,
+                "hidden_size": 192,
+                "num_hidden_layers": 10,
+                "window_block_indices": [0, 1, 3, 6, 7, 9],
+                "out_indices": [2, 4, 5, 9],
+            },
+            **kwargs,
+        )
 
         self.backbone_config = backbone_config
         # projector
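Both legacy branches — building a default backbone config when `backbone_config` is `None`, and reviving a serialized dict through `CONFIG_MAPPING` — are folded into the new `consolidate_backbone_kwargs_to_config` helper. A hedged usage sketch of the consolidated behavior as this hunk suggests it (the dict kwargs are illustrative):

```python
from transformers import LwDetrConfig

# Default path: no backbone_config, so the helper builds the default
# `lw_detr_vit` backbone from the kwargs listed in the hunk above.
config = LwDetrConfig()
print(config.backbone_config.num_hidden_layers)  # 10, per the defaults above

# Dict path: a serialized backbone config is revived into its config class,
# as the removed CONFIG_MAPPING/from_dict branch used to do.
config = LwDetrConfig(backbone_config={"model_type": "lw_detr_vit", "hidden_size": 192})
print(config.backbone_config.hidden_size)  # 192
```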
@@ -30,14 +30,14 @@ from torch import Tensor, nn
 
 from ... import initialization as init
 from ...activations import ACT2CLS, ACT2FN
+from ...backbone_utils import BackboneMixin
 from ...integrations import use_kernel_forward_from_hub
 from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import BackboneOutput
+from ...modeling_outputs import BackboneOutput, BaseModelOutputWithCrossAttentions
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...pytorch_utils import meshgrid
-from ...utils import ModelOutput, TransformersKwargs, auto_docstring
-from ...utils.backbone_utils import BackboneMixin
+from ...utils import ModelOutput, TransformersKwargs, auto_docstring, torch_compilable_check
 from ...utils.generic import check_model_inputs
 from .configuration_lw_detr import LwDetrConfig, LwDetrViTConfig
 
@@ -114,9 +114,9 @@ class LwDetrViTSelfAttention(nn.Module):
         value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
         query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         context_layer, attention_probs = attention_interface(
             self,
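The truncated assignment removed above is replaced, here and in the other attention modules in this diff, by `ALL_ATTENTION_FUNCTIONS.get_interface(impl, default)`. A standalone sketch of the registry-with-fallback pattern this implies; the registry and kernels below are illustrative stand-ins, not the transformers internals:

```python
from typing import Callable

def eager_attention_forward(query, key, value):
    return "eager"  # stand-in for the reference kernel

def sdpa_attention_forward(query, key, value):
    return "sdpa"  # stand-in for a fused kernel such as F.scaled_dot_product_attention

ATTENTION_FUNCTIONS: dict[str, Callable] = {"sdpa": sdpa_attention_forward}

def get_interface(attn_implementation, default: Callable) -> Callable:
    # Unregistered names (e.g. "eager") fall back to the default kernel,
    # which the modules in this diff pass as `eager_attention_forward`.
    return ATTENTION_FUNCTIONS.get(attn_implementation, default)

assert get_interface("eager", eager_attention_forward)(0, 0, 0) == "eager"
assert get_interface("sdpa", eager_attention_forward)(0, 0, 0) == "sdpa"
```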
@@ -367,10 +367,9 @@ class LwDetrViTPreTrainedModel(PreTrainedModel):
 
 
 @auto_docstring()
-class LwDetrViTBackbone(
+class LwDetrViTBackbone(BackboneMixin, LwDetrViTPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        super()._init_backbone(config)
 
         self.embeddings = LwDetrViTEmbeddings(config)
         self.encoder = LwDetrViTEncoder(config)
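With `BackboneMixin` promoted to a direct base class, the separate `_init_backbone` call is no longer needed and the model behaves like the other transformers backbones. A hedged usage sketch, assuming the two classes are exported at the top level as backbone classes usually are:

```python
import torch
from transformers import LwDetrViTBackbone, LwDetrViTConfig

config = LwDetrViTConfig(out_indices=[2, 4, 5, 9])
backbone = LwDetrViTBackbone(config)

# Random pixels only to exercise the forward pass; real inputs come from an
# image processor.
pixel_values = torch.randn(1, 3, config.image_size, config.image_size)
outputs = backbone(pixel_values)
print(len(outputs.feature_maps))  # one feature map per configured out_index
```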
@@ -692,9 +691,9 @@ class LwDetrAttention(nn.Module):
         key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
         value_states = self.v_proj(hidden_states_original).view(hidden_shape).transpose(1, 2)
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -807,9 +806,6 @@ class LwDetrMultiscaleDeformableAttention(nn.Module):
 
         self.disable_custom_kernels = config.disable_custom_kernels
 
-    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Tensor | None):
-        return tensor if position_embeddings is None else tensor + position_embeddings
-
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -822,18 +818,18 @@ class LwDetrMultiscaleDeformableAttention(nn.Module):
         spatial_shapes_list=None,
         level_start_index=None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         # add position embeddings to the hidden states before projecting to queries and keys
         if position_embeddings is not None:
-            hidden_states =
+            hidden_states = hidden_states + position_embeddings
 
         batch_size, num_queries, _ = hidden_states.shape
         batch_size, sequence_length, _ = encoder_hidden_states.shape
         total_elements = sum(height * width for height, width in spatial_shapes_list)
-
-
-
-
+        torch_compilable_check(
+            total_elements == sequence_length,
+            "Make sure to align the spatial shapes with the sequence length of the encoder hidden states",
+        )
 
         value = self.value_proj(encoder_hidden_states)
         if attention_mask is not None:
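The removed lines (truncated above) raised the shape mismatch as a plain Python check, which forces a graph break under `torch.compile`; the new `torch_compilable_check` keeps the assertion compile-friendly. A hedged sketch of what such a helper typically does — this is an assumption about its internals, not the transformers implementation:

```python
import torch

# Hedged sketch of a compile-friendly assertion, not the transformers helper:
# under torch.compile, `torch._check` keeps the condition inside the graph,
# while a plain `if cond: raise` would break it.
def torch_compilable_check(condition, message: str) -> None:
    if torch.compiler.is_compiling():
        torch._check(condition, lambda: message)
    elif not condition:
        raise ValueError(message)

total_elements, sequence_length = 16, 16
torch_compilable_check(
    total_elements == sequence_length,
    "Make sure to align the spatial shapes with the sequence length of the encoder hidden states",
)
```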
@@ -1027,24 +1023,22 @@ class LwDetrPreTrainedModel(PreTrainedModel):
         - a stacked tensor of intermediate reference points.
     """
 )
-class LwDetrDecoderOutput(
+class LwDetrDecoderOutput(BaseModelOutputWithCrossAttentions):
     r"""
-    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
-        Stacked intermediate hidden states (output of each layer of the decoder).
-    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
-        Stacked intermediate reference points (reference points of each layer of the decoder).
     cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
         Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
         sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
         used to compute the weighted average in the cross-attention heads.
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
+        Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
+        layernorm.
+    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
+        Stacked intermediate reference points (reference points of each layer of the decoder).
     """
 
-    last_hidden_state: torch.FloatTensor | None = None
     intermediate_hidden_states: torch.FloatTensor | None = None
+
     intermediate_reference_points: torch.FloatTensor | None = None
-    hidden_states: tuple[torch.FloatTensor] | None = None
-    attentions: tuple[torch.FloatTensor] | None = None
-    cross_attentions: tuple[torch.FloatTensor] | None = None
 
 
 # function to generate sine positional embedding for 4d coordinates
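`LwDetrDecoderOutput` now inherits `last_hidden_state`, `hidden_states`, `attentions`, and `cross_attentions` from `BaseModelOutputWithCrossAttentions`, so only the two intermediate tensors remain declared locally. A small sketch of the subclassing pattern (the class name below is illustrative):

```python
from dataclasses import dataclass

import torch
from transformers.modeling_outputs import BaseModelOutputWithCrossAttentions

@dataclass
class DecoderOutputSketch(BaseModelOutputWithCrossAttentions):
    # Only the extra tensors need declaring; the base fields are inherited.
    intermediate_hidden_states: torch.FloatTensor | None = None
    intermediate_reference_points: torch.FloatTensor | None = None

out = DecoderOutputSketch(last_hidden_state=torch.zeros(1, 2, 4))
print(out.last_hidden_state.shape)  # inherited field: torch.Size([1, 2, 4])
```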
@@ -1244,11 +1238,11 @@ class LwDetrModel(LwDetrPreTrainedModel):
         self.post_init()
 
     def freeze_backbone(self):
-        for name, param in self.backbone.
+        for name, param in self.backbone.model.named_parameters():
             param.requires_grad_(False)
 
     def unfreeze_backbone(self):
-        for name, param in self.backbone.
+        for name, param in self.backbone.model.named_parameters():
             param.requires_grad_(True)
 
     def get_valid_ratio(self, mask, dtype=torch.float32):
@@ -1269,15 +1263,18 @@ class LwDetrModel(LwDetrPreTrainedModel):
         temperature = 10000
         scale = 2 * math.pi
 
-
+        # Compute position embeddings in float32 to avoid overflow with large temperature values in fp16
+        proposals_dtype = proposals.dtype
+        dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
         dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
         # batch_size, num_queries, 4
-        proposals = proposals.sigmoid() * scale
+        proposals = proposals.sigmoid().to(torch.float32) * scale
         # batch_size, num_queries, 4, 128
         pos = proposals[:, :, :, None] / dim_t
         # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512
         pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
-
+        # Convert back to target dtype after all computations are done
+        return pos.to(proposals_dtype)
 
     def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
         """Generate the encoder output proposals from encoded enc_output.
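The hunk upcasts the sine-embedding computation to float32 and restores the caller's dtype at the end, since the `temperature ** x` terms approach 10000 and fp16 arithmetic on them loses precision or overflows in intermediate steps. A worked standalone sketch of the same computation and its output shape:

```python
import torch

num_pos_feats, temperature = 128, 10000
proposals = torch.rand(1, 3, 4).to(torch.float16)  # normalized boxes

# Same upcast dance as the hunk above: compute in float32, cast back at the end.
proposals_dtype = proposals.dtype
dim_t = torch.arange(num_pos_feats, dtype=torch.float32)
dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)

scale = 2 * torch.pi
pos = (proposals.sigmoid().to(torch.float32) * scale)[:, :, :, None] / dim_t
pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
pos = pos.to(proposals_dtype)  # back to the caller's dtype
print(pos.shape)  # torch.Size([1, 3, 512]) = 4 coordinates x 64 frequencies x (sin, cos)
```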
@@ -1352,13 +1349,15 @@ class LwDetrModel(LwDetrPreTrainedModel):
        ```python
         >>> from transformers import AutoImageProcessor, DeformableDetrModel
         >>> from PIL import Image
-        >>> import
+        >>> import httpx
+        >>> from io import BytesIO
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>>
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
 
-        >>> image_processor = AutoImageProcessor.from_pretrained("
-        >>> model = DeformableDetrModel.from_pretrained("
+        >>> image_processor = AutoImageProcessor.from_pretrained("AnnaZhang/lwdetr_small_60e_coco")
+        >>> model = DeformableDetrModel.from_pretrained("AnnaZhang/lwdetr_small_60e_coco")
 
         >>> inputs = image_processor(images=image, return_tensors="pt")
 
@@ -1487,8 +1486,6 @@ class LwDetrMLPPredictionHead(nn.Module):
     Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
     height and width of a bounding box w.r.t. an image.
 
-    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
-
     """
 
     def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
@@ -1604,13 +1601,15 @@ class LwDetrForObjectDetection(LwDetrPreTrainedModel):
        ```python
         >>> from transformers import AutoImageProcessor, LwDetrForObjectDetection
         >>> from PIL import Image
-        >>> import
+        >>> import httpx
+        >>> from io import BytesIO
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>>
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
 
-        >>> image_processor = AutoImageProcessor.from_pretrained("
-        >>> model = LwDetrForObjectDetection.from_pretrained("
+        >>> image_processor = AutoImageProcessor.from_pretrained("AnnaZhang/lwdetr_small_60e_coco")
+        >>> model = LwDetrForObjectDetection.from_pretrained("AnnaZhang/lwdetr_small_60e_coco")
 
         >>> inputs = image_processor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -21,6 +21,7 @@ from torch import nn
 
 from ... import initialization as init
 from ...activations import ACT2FN
+from ...backbone_utils import consolidate_backbone_kwargs_to_config
 from ...configuration_utils import PreTrainedConfig
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BackboneOutput
@@ -29,7 +30,7 @@ from ...processing_utils import Unpack
 from ...pytorch_utils import meshgrid
 from ...utils import ModelOutput, TransformersKwargs, auto_docstring, logging
 from ...utils.generic import check_model_inputs
-from ..auto
+from ..auto import AutoConfig
 from ..convnext.modeling_convnext import ConvNextLayerNorm
 from ..dab_detr.modeling_dab_detr import gen_sine_position_embeddings
 from ..deformable_detr.modeling_deformable_detr import (
@@ -40,7 +41,6 @@ from ..deformable_detr.modeling_deformable_detr import (
     DeformableDetrMultiscaleDeformableAttention,
 )
 from ..llama.modeling_llama import eager_attention_forward
-from ..rt_detr.configuration_rt_detr import CONFIG_MAPPING
 from ..rt_detr.modeling_rt_detr import RTDetrConvNormLayer
 from ..vit.modeling_vit import ViTAttention, ViTEncoder, ViTSelfAttention
 from ..vitdet.configuration_vitdet import VitDetConfig
@@ -60,7 +60,7 @@ class LwDetrViTConfig(VitDetConfig):
     This is the configuration class to store the configuration of a [`LwDetrViTModel`]. It is used to instantiate an
     LW-DETR ViT model according to the specified arguments, defining the model architecture. Instantiating a configuration
     with the defaults will yield a similar configuration to that of the LW-DETR ViT
-    [
+    [AnnaZhang/lwdetr_small_60e_coco](https://huggingface.co/AnnaZhang/lwdetr_small_60e_coco) architecture.
 
     LW-DETR ViT is the Vision Transformer backbone used in the LW-DETR model for real-time object detection. It features
     interleaved window and global attention mechanisms to reduce computational complexity while maintaining high performance.
@@ -200,7 +200,7 @@ class LwDetrConfig(PreTrainedConfig):
     This is the configuration class to store the configuration of a [`LwDetrModel`]. It is used to instantiate
     a LW-DETR model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the LW-DETR
-    [
+    [AnnaZhang/lwdetr_small_60e_coco](https://huggingface.co/AnnaZhang/lwdetr_small_60e_coco) architecture.
 
     LW-DETR (Lightweight Detection Transformer) is a transformer-based object detection model designed for real-time
     detection tasks. It replaces traditional CNN-based detectors like YOLO with a more efficient transformer architecture
@@ -282,10 +282,10 @@ class LwDetrConfig(PreTrainedConfig):
    ```python
     >>> from transformers import LwDetrConfig, LwDetrModel
 
-    >>> # Initializing a LW-DETR
+    >>> # Initializing a LW-DETR AnnaZhang/lwdetr_small_60e_coco style configuration
    >>> configuration = LwDetrConfig()
 
-    >>> # Initializing a model (with random weights) from the
+    >>> # Initializing a model (with random weights) from the AnnaZhang/lwdetr_small_60e_coco style configuration
     >>> model = LwDetrModel(configuration)
 
     >>> # Accessing the model configuration
@@ -337,24 +337,18 @@ class LwDetrConfig(PreTrainedConfig):
     ):
         self.batch_norm_eps = batch_norm_eps
 
-
-
-
-
-
-
-
-
-
-
-
-
-                **kwargs,
-            )
-        elif isinstance(backbone_config, dict):
-            backbone_model_type = backbone_config.pop("model_type")
-            config_class = CONFIG_MAPPING[backbone_model_type]
-            backbone_config = config_class.from_dict(backbone_config)
+        backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
+            backbone_config=backbone_config,
+            default_config_type="lw_detr_vit",
+            default_config_kwargs={
+                "image_size": 1024,
+                "hidden_size": 192,
+                "num_hidden_layers": 10,
+                "window_block_indices": [0, 1, 3, 6, 7, 9],
+                "out_indices": [2, 4, 5, 9],
+            },
+            **kwargs,
+        )
 
         self.backbone_config = backbone_config
         # projector
@@ -420,9 +414,9 @@ class LwDetrViTSelfAttention(ViTSelfAttention):
         value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
         query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         context_layer, attention_probs = attention_interface(
             self,
@@ -851,9 +845,9 @@ class LwDetrAttention(nn.Module):
         key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
         value_states = self.v_proj(hidden_states_original).view(hidden_shape).transpose(1, 2)
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -1283,13 +1277,15 @@ class LwDetrModel(DeformableDetrModel):
        ```python
         >>> from transformers import AutoImageProcessor, DeformableDetrModel
         >>> from PIL import Image
-        >>> import
+        >>> import httpx
+        >>> from io import BytesIO
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>>
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
 
-        >>> image_processor = AutoImageProcessor.from_pretrained("
-        >>> model = DeformableDetrModel.from_pretrained("
+        >>> image_processor = AutoImageProcessor.from_pretrained("AnnaZhang/lwdetr_small_60e_coco")
+        >>> model = DeformableDetrModel.from_pretrained("AnnaZhang/lwdetr_small_60e_coco")
 
         >>> inputs = image_processor(images=image, return_tensors="pt")
 
@@ -1515,13 +1511,15 @@ class LwDetrForObjectDetection(DeformableDetrForObjectDetection):
        ```python
         >>> from transformers import AutoImageProcessor, LwDetrForObjectDetection
         >>> from PIL import Image
-        >>> import
+        >>> import httpx
+        >>> from io import BytesIO
 
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>>
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
 
-        >>> image_processor = AutoImageProcessor.from_pretrained("
-        >>> model = LwDetrForObjectDetection.from_pretrained("
+        >>> image_processor = AutoImageProcessor.from_pretrained("AnnaZhang/lwdetr_small_60e_coco")
+        >>> model = LwDetrForObjectDetection.from_pretrained("AnnaZhang/lwdetr_small_60e_coco")
 
         >>> inputs = image_processor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -96,6 +96,14 @@ class LxmertConfig(PreTrainedConfig):
             Whether or not to calculate the attribute-prediction loss objective
         visual_feat_loss (`bool`, *optional*, defaults to `True`):
             Whether or not to calculate the feature-regression loss objective
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
     """
 
     model_type = "lxmert"
@@ -129,6 +137,10 @@ class LxmertConfig(PreTrainedConfig):
         visual_obj_loss=True,
         visual_attr_loss=True,
         visual_feat_loss=True,
+        pad_token_id=None,
+        bos_token_id=None,
+        eos_token_id=None,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -157,6 +169,10 @@ class LxmertConfig(PreTrainedConfig):
         self.visual_obj_loss = visual_obj_loss
         self.visual_attr_loss = visual_attr_loss
         self.visual_feat_loss = visual_feat_loss
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
         self.num_hidden_layers = {"vision": r_layers, "cross_encoder": x_layers, "language": l_layers}
         super().__init__(**kwargs)
 
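The special-token ids and `tie_word_embeddings` become explicit, documented constructor arguments instead of riding through `**kwargs`. A quick usage sketch with illustrative id values:

```python
from transformers import LxmertConfig

# The id values here are illustrative, not checkpoint defaults.
config = LxmertConfig(pad_token_id=0, bos_token_id=101, eos_token_id=102, tie_word_embeddings=True)
print(config.pad_token_id, config.tie_word_embeddings)  # 0 True
```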
@@ -118,6 +118,7 @@ class M2M100Config(PreTrainedConfig):
         pad_token_id=1,
         bos_token_id=0,
         eos_token_id=2,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -140,14 +141,12 @@ class M2M100Config(PreTrainedConfig):
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
 
-
-
-
-
-
-
-            **kwargs,
-        )
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.decoder_start_token_id = decoder_start_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
 
 
 __all__ = ["M2M100Config"]
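Same pattern as the Lxmert change above: the ids are now assigned as plain attributes before calling `super().__init__`, which receives only `is_encoder_decoder` and the remaining kwargs. A short check of the defaults, grounded in the constructor signature shown earlier in this diff:

```python
from transformers import M2M100Config

config = M2M100Config()
# Defaults from the signature: pad=1, bos=0, eos=2.
print(config.pad_token_id, config.bos_token_id, config.eos_token_id)  # 1 0 2
print(config.tie_word_embeddings)  # True, the new explicit default
```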
@@ -313,9 +313,9 @@ class M2M100Attention(nn.Module):
         if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
             past_key_values.is_updated[self.layer_idx] = True
 
-        attention_interface: Callable =
-
-
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -80,7 +80,8 @@ class MambaConfig(PreTrainedConfig):
             Whether or not the cache should be used.
         use_mambapy (`bool`, *optional*, defaults to `False`):
             Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not available. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited.
-
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
 
     Example:
 
@@ -125,6 +126,7 @@ class MambaConfig(PreTrainedConfig):
         rescale_prenorm_residual=False,
         use_cache=True,
         use_mambapy=False,
+        tie_word_embeddings=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -152,8 +154,9 @@ class MambaConfig(PreTrainedConfig):
         self.residual_in_fp32 = residual_in_fp32
         self.use_cache = use_cache
         self.use_mambapy = use_mambapy
+        self.tie_word_embeddings = tie_word_embeddings
 
-        super().__init__(
+        super().__init__(**kwargs)
 
 
 __all__ = ["MambaConfig"]
@@ -752,41 +752,33 @@ class MambaForCausalLM(MambaPreTrainedModel, GenerationMixin):
         is_first_iteration: bool | None = False,
         **kwargs,
     ):
-        # Overwritten --
-        model_inputs =
+        # Overwritten -- has custom cache class `MambaCache`
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_params=cache_params,
+            cache_position=cache_position,
+            attention_mask=attention_mask,
+            is_first_iteration=is_first_iteration,
+            **kwargs,
+        )
+
         if use_cache and cache_params is None:
             # we initialize the `cache_position` to full size of `conv_states` at prefill stage
             # considering padding will be applied when input length is shorter, and truncation
             # will be applied when it is longer, so it will be equivalent to always have it match
             # the length of `cache_params.conv_states`, which is `config.conv_kernel`
-            cache_position = torch.arange(0, self.backbone.config.conv_kernel, device=input_ids.device)
+            model_inputs["cache_position"] = torch.arange(0, self.backbone.config.conv_kernel, device=input_ids.device)
             if inputs_embeds is not None:
-                model_inputs = {"inputs_embeds": inputs_embeds}
                 max_batch_size = inputs_embeds.size(0)
             else:
                 max_batch_size = input_ids.size(0)
-            cache_params = MambaCache(
-
-
-
-            attention_mask = None
-
-        if not use_cache and inputs_embeds is not None:
-            model_inputs = {"inputs_embeds": inputs_embeds}
-
-        model_inputs.update(
-            {
-                "cache_params": cache_params,
-                "use_cache": use_cache,
-                "cache_position": cache_position,
-                "attention_mask": attention_mask,
-            }
-        )
-
-        # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
-        for key, value in kwargs.items():
-            if key not in model_inputs:
-                model_inputs[key] = value
+            model_inputs["cache_params"] = MambaCache(
+                self.backbone.config, max_batch_size, device=self.device, dtype=self.dtype
+            )
+        elif use_cache and cache_position[0] > 0:
+            model_inputs["attention_mask"] = None
 
         return model_inputs
 
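The rewrite delegates dict assembly to `GenerationMixin.prepare_inputs_for_generation` and then only patches the Mamba-specific entries: a full-width `cache_position` plus a fresh `MambaCache` at prefill, and a dropped attention mask on later steps. A hedged end-to-end sketch that exercises this path implicitly through `generate`; the checkpoint name is a public Hub id:

```python
import torch
from transformers import AutoTokenizer, MambaForCausalLM

tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")

inputs = tokenizer("Hello", return_tensors="pt")
# First step: cache_params is None, so prepare_inputs_for_generation allocates
# a MambaCache sized to the batch; later steps reuse it and drop the mask.
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=8, use_cache=True)
print(tokenizer.decode(out[0]))
```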
@@ -177,13 +177,11 @@ class Mamba2Config(PreTrainedConfig):
         self.time_step_limit = time_step_limit
         self.tie_word_embeddings = tie_word_embeddings
 
-
-
-
-
-
-            **kwargs,
-        )
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+        super().__init__(**kwargs)
 
 
 __all__ = ["Mamba2Config"]