transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,404 @@
|
|
|
1
|
+
# coding=utf-8
|
|
2
|
+
# Copyright 2025 Meta AI and The HuggingFace Inc. team. All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
"""PyTorch Pixio model."""
|
|
16
|
+
|
|
17
|
+
from typing import Optional
|
|
18
|
+
|
|
19
|
+
import torch
|
|
20
|
+
from torch import nn
|
|
21
|
+
|
|
22
|
+
from ...modeling_layers import GradientCheckpointingLayer
|
|
23
|
+
from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling
|
|
24
|
+
from ...utils import auto_docstring, is_tracing, logging
|
|
25
|
+
from ...utils.generic import check_model_inputs
|
|
26
|
+
from ..dinov2.configuration_dinov2 import Dinov2Config
|
|
27
|
+
from ..dinov2.modeling_dinov2 import (
|
|
28
|
+
Dinov2Backbone,
|
|
29
|
+
Dinov2DropPath,
|
|
30
|
+
Dinov2MLP,
|
|
31
|
+
)
|
|
32
|
+
from ..vit.modeling_vit import ViTAttention, ViTPatchEmbeddings, ViTPreTrainedModel
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
logger = logging.get_logger(__name__)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class PixioConfig(Dinov2Config):
|
|
39
|
+
r"""
|
|
40
|
+
This is the configuration class to store the configuration of a [`PixioModel`]. It is used to instantiate a
|
|
41
|
+
Pixio model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
|
42
|
+
with the defaults will yield a similar configuration to that of the ViT
|
|
43
|
+
[facebook/pixio-huge](https://huggingface.co/facebook/pixio-huge) architecture.
|
|
44
|
+
|
|
45
|
+
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
|
|
46
|
+
documentation from [`PreTrainedConfig`] for more information.
|
|
47
|
+
|
|
48
|
+
Args:
|
|
49
|
+
hidden_size (`int`, *optional*, defaults to 1280):
|
|
50
|
+
Dimensionality of the encoder layers and the pooler layer.
|
|
51
|
+
num_hidden_layers (`int`, *optional*, defaults to 32):
|
|
52
|
+
Number of hidden layers in the Transformer encoder.
|
|
53
|
+
num_attention_heads (`int`, *optional*, defaults to 16):
|
|
54
|
+
Number of attention heads for each attention layer in the Transformer encoder.
|
|
55
|
+
mlp_ratio (`int`, *optional*, defaults to 4):
|
|
56
|
+
Ratio of the hidden size of the MLPs relative to the `hidden_size`.
|
|
57
|
+
n_cls_tokens (`int`, *optional*, defaults to 8):
|
|
58
|
+
Number of class tokens in the Transformer encoder.
|
|
59
|
+
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
|
|
60
|
+
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
|
61
|
+
`"relu"`, `"selu"` and `"gelu_new"` are supported.
|
|
62
|
+
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
|
|
63
|
+
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
|
64
|
+
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
|
|
65
|
+
The dropout ratio for the attention probabilities.
|
|
66
|
+
initializer_range (`float`, *optional*, defaults to 0.02):
|
|
67
|
+
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
|
68
|
+
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
|
|
69
|
+
The epsilon used by the layer normalization layers.
|
|
70
|
+
image_size (`int`, *optional*, defaults to 256):
|
|
71
|
+
The size (resolution) of each image.
|
|
72
|
+
patch_size (`int`, *optional*, defaults to 16):
|
|
73
|
+
The size (resolution) of each patch.
|
|
74
|
+
num_channels (`int`, *optional*, defaults to 3):
|
|
75
|
+
The number of input channels.
|
|
76
|
+
qkv_bias (`bool`, *optional*, defaults to `True`):
|
|
77
|
+
Whether to add a bias to the queries, keys and values.
|
|
78
|
+
drop_path_rate (`float`, *optional*, defaults to 0.0):
|
|
79
|
+
Stochastic depth rate per sample (when applied in the main path of residual layers).
|
|
80
|
+
out_features (`list[str]`, *optional*):
|
|
81
|
+
If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
|
|
82
|
+
(depending on how many stages the model has). If unset and `out_indices` is set, will default to the
|
|
83
|
+
corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
|
|
84
|
+
same order as defined in the `stage_names` attribute.
|
|
85
|
+
out_indices (`list[int]`, *optional*):
|
|
86
|
+
If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
|
|
87
|
+
many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
|
|
88
|
+
If unset and `out_features` is unset, will default to the last stage. Must be in the
|
|
89
|
+
same order as defined in the `stage_names` attribute.
|
|
90
|
+
apply_layernorm (`bool`, *optional*, defaults to `True`):
|
|
91
|
+
Whether to apply layer normalization to the feature maps in case the model is used as backbone.
|
|
92
|
+
reshape_hidden_states (`bool`, *optional*, defaults to `True`):
|
|
93
|
+
Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
|
|
94
|
+
case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
|
|
95
|
+
seq_len, hidden_size)`.
|
|
96
|
+
|
|
97
|
+
Example:
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
>>> from transformers import PixioConfig, PixioModel
|
|
101
|
+
|
|
102
|
+
>>> # Initializing a Pixio pixio-huge style configuration
|
|
103
|
+
>>> configuration = PixioConfig()
|
|
104
|
+
|
|
105
|
+
>>> # Initializing a model (with random weights) from the pixio-huge style configuration
|
|
106
|
+
>>> model = PixioModel(configuration)
|
|
107
|
+
|
|
108
|
+
>>> # Accessing the model configuration
|
|
109
|
+
>>> configuration = model.config
|
|
110
|
+
```"""
|
|
111
|
+
|
|
112
|
+
model_type = "pixio"
|
|
113
|
+
|
|
114
|
+
def __init__(
|
|
115
|
+
self,
|
|
116
|
+
hidden_size=1280,
|
|
117
|
+
num_hidden_layers=32,
|
|
118
|
+
num_attention_heads=16,
|
|
119
|
+
mlp_ratio=4,
|
|
120
|
+
n_cls_tokens=8,
|
|
121
|
+
hidden_act="gelu",
|
|
122
|
+
hidden_dropout_prob=0.0,
|
|
123
|
+
attention_probs_dropout_prob=0.0,
|
|
124
|
+
initializer_range=0.02,
|
|
125
|
+
layer_norm_eps=1e-6,
|
|
126
|
+
image_size=256,
|
|
127
|
+
patch_size=16,
|
|
128
|
+
num_channels=3,
|
|
129
|
+
qkv_bias=True,
|
|
130
|
+
drop_path_rate=0.0,
|
|
131
|
+
out_features=None,
|
|
132
|
+
out_indices=None,
|
|
133
|
+
apply_layernorm=True,
|
|
134
|
+
reshape_hidden_states=True,
|
|
135
|
+
**kwargs,
|
|
136
|
+
):
|
|
137
|
+
super().__init__(
|
|
138
|
+
hidden_size=hidden_size,
|
|
139
|
+
num_hidden_layers=num_hidden_layers,
|
|
140
|
+
num_attention_heads=num_attention_heads,
|
|
141
|
+
mlp_ratio=mlp_ratio,
|
|
142
|
+
hidden_act=hidden_act,
|
|
143
|
+
hidden_dropout_prob=hidden_dropout_prob,
|
|
144
|
+
attention_probs_dropout_prob=attention_probs_dropout_prob,
|
|
145
|
+
initializer_range=initializer_range,
|
|
146
|
+
layer_norm_eps=layer_norm_eps,
|
|
147
|
+
image_size=image_size,
|
|
148
|
+
patch_size=patch_size,
|
|
149
|
+
num_channels=num_channels,
|
|
150
|
+
qkv_bias=qkv_bias,
|
|
151
|
+
drop_path_rate=drop_path_rate,
|
|
152
|
+
apply_layernorm=apply_layernorm,
|
|
153
|
+
reshape_hidden_states=reshape_hidden_states,
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
self.n_cls_tokens = n_cls_tokens
|
|
157
|
+
|
|
158
|
+
del self.layerscale_value
|
|
159
|
+
del self.use_swiglu_ffn
|
|
160
|
+
del self.use_mask_token
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
class PixioPatchEmbeddings(ViTPatchEmbeddings):
|
|
164
|
+
pass
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
class PixioEmbeddings(nn.Module):
|
|
168
|
+
"""
|
|
169
|
+
Construct the CLS tokens, position and patch embeddings.
|
|
170
|
+
"""
|
|
171
|
+
|
|
172
|
+
def __init__(self, config: PixioConfig) -> None:
|
|
173
|
+
super().__init__()
|
|
174
|
+
|
|
175
|
+
self.cls_token = nn.Parameter(torch.randn(1, config.n_cls_tokens, config.hidden_size))
|
|
176
|
+
self.mask_token = None
|
|
177
|
+
self.patch_embeddings = PixioPatchEmbeddings(config)
|
|
178
|
+
num_patches = self.patch_embeddings.num_patches
|
|
179
|
+
self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + config.n_cls_tokens, config.hidden_size))
|
|
180
|
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
|
181
|
+
self.n_cls_tokens = config.n_cls_tokens
|
|
182
|
+
self.patch_size = config.patch_size
|
|
183
|
+
self.config = config
|
|
184
|
+
|
|
185
|
+
def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
|
|
186
|
+
"""
|
|
187
|
+
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
|
|
188
|
+
images. This method is also adapted to support tracing and interpolation at torch.float32 precision.
|
|
189
|
+
|
|
190
|
+
Adapted from:
|
|
191
|
+
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
|
|
192
|
+
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
|
|
193
|
+
"""
|
|
194
|
+
num_patches = embeddings.shape[1] - self.n_cls_tokens
|
|
195
|
+
num_positions = self.position_embeddings.shape[1] - self.n_cls_tokens
|
|
196
|
+
|
|
197
|
+
if not is_tracing() and num_patches == num_positions and height == width:
|
|
198
|
+
return self.position_embeddings
|
|
199
|
+
|
|
200
|
+
class_pos_embed = self.position_embeddings[:, : self.n_cls_tokens]
|
|
201
|
+
patch_pos_embed = self.position_embeddings[:, self.n_cls_tokens :]
|
|
202
|
+
|
|
203
|
+
dim = embeddings.shape[-1]
|
|
204
|
+
|
|
205
|
+
new_height = height // self.patch_size
|
|
206
|
+
new_width = width // self.patch_size
|
|
207
|
+
|
|
208
|
+
sqrt_num_positions = int(num_positions**0.5)
|
|
209
|
+
patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
|
|
210
|
+
patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
|
|
211
|
+
target_dtype = patch_pos_embed.dtype
|
|
212
|
+
patch_pos_embed = nn.functional.interpolate(
|
|
213
|
+
patch_pos_embed.to(torch.float32),
|
|
214
|
+
size=(new_height, new_width),
|
|
215
|
+
mode="bicubic",
|
|
216
|
+
align_corners=False,
|
|
217
|
+
).to(dtype=target_dtype)
|
|
218
|
+
|
|
219
|
+
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
|
|
220
|
+
|
|
221
|
+
return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
|
|
222
|
+
|
|
223
|
+
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
|
|
224
|
+
batch_size, _, height, width = pixel_values.shape
|
|
225
|
+
target_dtype = self.patch_embeddings.projection.weight.dtype
|
|
226
|
+
embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
|
|
227
|
+
|
|
228
|
+
cls_tokens = self.cls_token.expand(batch_size, -1, -1)
|
|
229
|
+
embeddings = torch.cat((cls_tokens, embeddings), dim=1)
|
|
230
|
+
|
|
231
|
+
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
|
|
232
|
+
|
|
233
|
+
embeddings = self.dropout(embeddings)
|
|
234
|
+
|
|
235
|
+
return embeddings
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
class PixioAttention(ViTAttention):
|
|
239
|
+
pass
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
class PixioDropPath(Dinov2DropPath):
|
|
243
|
+
pass
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
class PixioMLP(Dinov2MLP):
|
|
247
|
+
pass
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
class PixioLayer(GradientCheckpointingLayer):
|
|
251
|
+
def __init__(self, config: PixioConfig) -> None:
|
|
252
|
+
super().__init__()
|
|
253
|
+
|
|
254
|
+
self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
|
255
|
+
self.attention = PixioAttention(config)
|
|
256
|
+
self.drop_path = PixioDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
|
|
257
|
+
|
|
258
|
+
self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
|
259
|
+
self.mlp = PixioMLP(config)
|
|
260
|
+
|
|
261
|
+
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
|
262
|
+
hidden_states_norm = self.norm1(hidden_states)
|
|
263
|
+
self_attention_output = self.attention(hidden_states_norm)
|
|
264
|
+
|
|
265
|
+
+        hidden_states = self.drop_path(self_attention_output) + hidden_states
+
+        layer_output = self.norm2(hidden_states)
+        layer_output = self.mlp(layer_output)
+
+        layer_output = self.drop_path(layer_output) + hidden_states
+
+        return layer_output
+
+
+class PixioEncoder(nn.Module):
+    def __init__(self, config: PixioConfig):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([PixioLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(self, hidden_states: torch.Tensor, output_hidden_states: bool = False) -> BaseModelOutput:
+        all_hidden_states = [hidden_states] if output_hidden_states else None
+        for i, layer_module in enumerate(self.layer):
+            hidden_states = layer_module(hidden_states)
+            if all_hidden_states:
+                all_hidden_states.append(hidden_states)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=tuple(all_hidden_states) if all_hidden_states else None,
+        )
+
+
+class PixioPreTrainedModel(ViTPreTrainedModel):
+    pass
+
+
+@auto_docstring
+class PixioModel(PixioPreTrainedModel):
+    def __init__(self, config: PixioConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = PixioEmbeddings(config)
+        self.encoder = PixioEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.post_init()
+
+    def get_input_embeddings(self) -> PixioPatchEmbeddings:
+        return self.embeddings.patch_embeddings
+
+    @check_model_inputs(tie_last_hidden_states=False)
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs,
+    ) -> BaseModelOutputWithPooling:
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        embedding_output = self.embeddings(pixel_values)
+
+        encoder_outputs: BaseModelOutput = self.encoder(embedding_output, output_hidden_states=output_hidden_states)
+        sequence_output = encoder_outputs.last_hidden_state
+        sequence_output = self.layernorm(sequence_output)
+        pooled_output = sequence_output[:, : self.embeddings.n_cls_tokens, :].mean(dim=1)
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Pixio backbone, to be used with frameworks like DETR and MaskFormer.
+    """
+)
+class PixioBackbone(Dinov2Backbone):
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self, pixel_values: torch.Tensor, output_hidden_states: Optional[bool] = None, **kwargs
+    ) -> BackboneOutput:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoBackbone
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("facebook/pixio-huge")
+        >>> model = AutoBackbone.from_pretrained(
+        ...     "facebook/pixio-huge", out_features=["stage7", "stage15", "stage23", "stage31"]
+        ... )
+
+        >>> inputs = processor(image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 1280, 16, 16]
+        ```"""
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+
+        embedding_output = self.embeddings(pixel_values)
+        output: BaseModelOutput = self.encoder(embedding_output, output_hidden_states=True)
+        hidden_states = output.hidden_states
+
+        feature_maps = []
+        for stage, hidden_state in zip(self.stage_names, hidden_states):
+            if stage in self.out_features:
+                if self.config.apply_layernorm:
+                    hidden_state = self.layernorm(hidden_state)
+                if self.config.reshape_hidden_states:
+                    hidden_state = hidden_state[:, self.embeddings.n_cls_tokens :]
+                    batch_size, _, height, width = pixel_values.shape
+                    patch_size = self.config.patch_size
+                    hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
+                    hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+                feature_maps.append(hidden_state)
+
+        return BackboneOutput(
+            feature_maps=tuple(feature_maps),
+            hidden_states=hidden_states if output_hidden_states else None,
+        )
+
+
+__all__ = ["PixioConfig", "PixioModel", "PixioPreTrainedModel", "PixioBackbone"]
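For orientation, a minimal usage sketch of the new PixioModel added above (not part of the diff itself); it assumes the facebook/pixio-huge checkpoint referenced in the PixioBackbone docstring also loads through AutoModel and AutoImageProcessor.

```python
# Hedged sketch: checkpoint availability via AutoModel is an assumption.
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("facebook/pixio-huge")
model = AutoModel.from_pretrained("facebook/pixio-huge")

with torch.no_grad():
    outputs = model(**processor(image, return_tensors="pt"))

# pooler_output is the mean over the first n_cls_tokens positions, per the forward above.
print(outputs.last_hidden_state.shape, outputs.pooler_output.shape)
```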
@@ -74,7 +74,7 @@ class PixtralRotaryEmbedding(nn.Module):

         inv_freq, attention_scaling = rope_init_fn(self.config, device)
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
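The change above (and the matching Qwen2RotaryEmbedding change later in this diff) replaces a plain original_inv_freq attribute with a registered, non-persistent buffer. A small standalone sketch, using a hypothetical toy module rather than the library code, of what that buys: the copy follows the module across .to(device) and stays out of the saved state_dict.

```python
import torch
from torch import nn


class RotaryBufferDemo(nn.Module):
    """Hypothetical toy module illustrating the buffer pattern used above."""

    def __init__(self, inv_freq: torch.Tensor):
        super().__init__()
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # A registered buffer moves with .to(device)/.cuda(); persistent=False
        # keeps it out of the serialized state_dict.
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)


demo = RotaryBufferDemo(torch.arange(4, dtype=torch.float32))
print("original_inv_freq" in demo.state_dict())  # False: non-persistent buffers are not saved
```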
@@ -150,7 +150,7 @@ class PixtralProcessor(ProcessorMixin):

         output_kwargs = self._merge_kwargs(
             PixtralProcessorKwargs,
-            tokenizer_init_kwargs=self.tokenizer
+            tokenizer_init_kwargs=getattr(self.tokenizer, "init_kwargs", {}),
             **kwargs,
         )

@@ -197,6 +197,8 @@ class PixtralProcessor(ProcessorMixin):

         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
         return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        # Remove return_token_type_ids as MistralCommonBackend doesn't support it
+        output_kwargs["text_kwargs"].pop("return_token_type_ids", None)
         text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"], return_tensors=None)
         self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])

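Both PixtralProcessor fixes above are defensive kwarg handling. A self-contained sketch of the two patterns, using a hypothetical DummyTokenizer rather than the real backend: fall back to an empty dict when the tokenizer exposes no init_kwargs, and drop return_token_type_ids before the tokenizer call.

```python
# Standalone illustration only; DummyTokenizer and the kwarg values are assumptions.
class DummyTokenizer:
    pass  # e.g. a MistralCommonBackend-style tokenizer without init_kwargs


tokenizer = DummyTokenizer()
text_kwargs = {"padding": True, "return_token_type_ids": True}

# Fix 1: tolerate tokenizers that have no init_kwargs attribute.
tokenizer_init_kwargs = getattr(tokenizer, "init_kwargs", {})

# Fix 2: strip the unsupported flag before calling the tokenizer.
text_kwargs.pop("return_token_type_ids", None)

print(tokenizer_init_kwargs)  # {}
print(text_kwargs)            # {'padding': True}
```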
@@ -151,6 +151,7 @@ class PLBartConfig(PreTrainedConfig):
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
@@ -27,6 +27,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -73,6 +74,11 @@ class PLBartPreTrainedModel(PreTrainedModel):
     _supports_sdpa = True
     _supports_flex_attn = True

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, PLBartForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)
+

 class PLBartLearnedPositionalEmbedding(nn.Embedding):
     """
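The new _init_weights hook above zero-initializes the final_logits_bias buffer of the conditional-generation model through the new initialization helpers. A minimal sketch of the same pattern with plain torch.nn.init; the class and function names here are toy stand-ins, not the transformers implementation.

```python
import torch
from torch import nn


class TinyGenerationHead(nn.Module):
    """Toy stand-in for a model that carries a final_logits_bias buffer."""

    def __init__(self, vocab_size: int = 8):
        super().__init__()
        self.register_buffer("final_logits_bias", torch.randn(1, vocab_size))


def init_weights(module: nn.Module) -> None:
    # Mirrors the spirit of `init.zeros_(module.final_logits_bias)` above.
    if isinstance(module, TinyGenerationHead):
        nn.init.zeros_(module.final_logits_bias)


head = TinyGenerationHead()
head.apply(init_weights)
print(head.final_logits_bias.abs().sum().item())  # 0.0
```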
@@ -1273,6 +1279,7 @@ class PLBartDecoderWrapper(PLBartPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.decoder = PLBartDecoder(config)
+        self.post_init()

     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)
@@ -21,6 +21,7 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss

+from ... import initialization as init
 from ...cache_utils import Cache
 from ...generation import GenerationMixin
 from ...modeling_outputs import (
@@ -56,6 +57,11 @@ class PLBartPreTrainedModel(PreTrainedModel):
     _supports_sdpa = True
     _supports_flex_attn = True

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, PLBartForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)
+

 class PLBartEncoder(BartEncoder):
     pass
@@ -231,7 +231,6 @@ class PoolFormerImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images

         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -268,7 +268,11 @@ class PoolFormerModel(PoolFormerPreTrainedModel):
         self.post_init()

     def get_input_embeddings(self):
-
+        # Input embeddings correspond to the very first patch-embedding stage.
+        return self.encoder.patch_embeddings[0]
+
+    def set_input_embeddings(self, value):
+        self.encoder.patch_embeddings[0] = value

     @auto_docstring
     def forward(
@@ -333,6 +337,12 @@ class PoolFormerForImageClassification(PoolFormerPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()

+    def get_input_embeddings(self):
+        return self.poolformer.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.poolformer.set_input_embeddings(value)
+
     @auto_docstring
     def forward(
         self,
@@ -37,9 +37,8 @@ class PromptDepthAnythingConfig(PreTrainedConfig):
     documentation from [`PreTrainedConfig`] for more information.

     Args:
-        backbone_config (`Union[dict
-            The configuration of the backbone model.
-            leverage the [`AutoBackbone`] API.
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `Dinov2Config()`):
+            The configuration of the backbone model.
         backbone (`str`, *optional*):
             Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
             will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
@@ -1848,6 +1848,7 @@ class ProphetNetForCausalLM(ProphetNetPreTrainedModel, GenerationMixin):
         past_key_values=None,
         attention_mask=None,
         use_cache=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- our tests complain if we use GenerationMixin.prepare_inputs_for_generation
@@ -1856,7 +1857,7 @@ class ProphetNetForCausalLM(ProphetNetPreTrainedModel, GenerationMixin):
         if attention_mask is None:
             attention_mask = input_ids.new_ones(input_ids.shape)

-        if past_key_values is not None and
+        if past_key_values is not None and not is_first_iteration:
             input_ids = input_ids[:, -1:]
         # first step, decoder_cached_states are empty
         model_inputs = {
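The is_first_iteration flag above changes when the prompt is trimmed: on the first decoding step the full input_ids are kept even if a (still empty) cache object has already been allocated. A hedged sketch of just that rule, as a standalone helper rather than the model code:

```python
import torch


def trim_inputs(input_ids, past_key_values, is_first_iteration=False):
    # Only keep the last token on follow-up steps; the first iteration keeps
    # the whole prompt even when a cache object already exists.
    if past_key_values is not None and not is_first_iteration:
        input_ids = input_ids[:, -1:]
    return input_ids


ids = torch.tensor([[5, 6, 7]])
print(trim_inputs(ids, past_key_values=object(), is_first_iteration=True))   # tensor([[5, 6, 7]])
print(trim_inputs(ids, past_key_values=object(), is_first_iteration=False))  # tensor([[7]])
```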
@@ -64,7 +64,7 @@ class Qwen2RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(