transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,404 @@
|
|
|
1
|
+
# coding=utf-8
|
|
2
|
+
# Copyright 2025 Meta AI and The HuggingFace Inc. team. All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
"""PyTorch Pixio model."""
|
|
16
|
+
|
|
17
|
+
from typing import Optional
|
|
18
|
+
|
|
19
|
+
import torch
|
|
20
|
+
from torch import nn
|
|
21
|
+
|
|
22
|
+
from ...modeling_layers import GradientCheckpointingLayer
|
|
23
|
+
from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling
|
|
24
|
+
from ...utils import auto_docstring, is_tracing, logging
|
|
25
|
+
from ...utils.generic import check_model_inputs
|
|
26
|
+
from ..dinov2.configuration_dinov2 import Dinov2Config
|
|
27
|
+
from ..dinov2.modeling_dinov2 import (
|
|
28
|
+
Dinov2Backbone,
|
|
29
|
+
Dinov2DropPath,
|
|
30
|
+
Dinov2MLP,
|
|
31
|
+
)
|
|
32
|
+
from ..vit.modeling_vit import ViTAttention, ViTPatchEmbeddings, ViTPreTrainedModel
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
logger = logging.get_logger(__name__)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class PixioConfig(Dinov2Config):
|
|
39
|
+
r"""
|
|
40
|
+
This is the configuration class to store the configuration of a [`PixioModel`]. It is used to instantiate a
|
|
41
|
+
Pixio model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
|
42
|
+
with the defaults will yield a similar configuration to that of the ViT
|
|
43
|
+
[facebook/pixio-huge](https://huggingface.co/facebook/pixio-huge) architecture.
|
|
44
|
+
|
|
45
|
+
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
|
|
46
|
+
documentation from [`PreTrainedConfig`] for more information.
|
|
47
|
+
|
|
48
|
+
Args:
|
|
49
|
+
hidden_size (`int`, *optional*, defaults to 1280):
|
|
50
|
+
Dimensionality of the encoder layers and the pooler layer.
|
|
51
|
+
num_hidden_layers (`int`, *optional*, defaults to 32):
|
|
52
|
+
Number of hidden layers in the Transformer encoder.
|
|
53
|
+
num_attention_heads (`int`, *optional*, defaults to 16):
|
|
54
|
+
Number of attention heads for each attention layer in the Transformer encoder.
|
|
55
|
+
mlp_ratio (`int`, *optional*, defaults to 4):
|
|
56
|
+
Ratio of the hidden size of the MLPs relative to the `hidden_size`.
|
|
57
|
+
n_cls_tokens (`int`, *optional*, defaults to 8):
|
|
58
|
+
Number of class tokens in the Transformer encoder.
|
|
59
|
+
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
|
|
60
|
+
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
|
61
|
+
`"relu"`, `"selu"` and `"gelu_new"` are supported.
|
|
62
|
+
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
|
|
63
|
+
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
|
64
|
+
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
|
|
65
|
+
The dropout ratio for the attention probabilities.
|
|
66
|
+
initializer_range (`float`, *optional*, defaults to 0.02):
|
|
67
|
+
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
|
68
|
+
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
|
|
69
|
+
The epsilon used by the layer normalization layers.
|
|
70
|
+
image_size (`int`, *optional*, defaults to 256):
|
|
71
|
+
The size (resolution) of each image.
|
|
72
|
+
patch_size (`int`, *optional*, defaults to 16):
|
|
73
|
+
The size (resolution) of each patch.
|
|
74
|
+
num_channels (`int`, *optional*, defaults to 3):
|
|
75
|
+
The number of input channels.
|
|
76
|
+
qkv_bias (`bool`, *optional*, defaults to `True`):
|
|
77
|
+
Whether to add a bias to the queries, keys and values.
|
|
78
|
+
drop_path_rate (`float`, *optional*, defaults to 0.0):
|
|
79
|
+
Stochastic depth rate per sample (when applied in the main path of residual layers).
|
|
80
|
+
out_features (`list[str]`, *optional*):
|
|
81
|
+
If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
|
|
82
|
+
(depending on how many stages the model has). If unset and `out_indices` is set, will default to the
|
|
83
|
+
corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
|
|
84
|
+
same order as defined in the `stage_names` attribute.
|
|
85
|
+
out_indices (`list[int]`, *optional*):
|
|
86
|
+
If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
|
|
87
|
+
many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
|
|
88
|
+
If unset and `out_features` is unset, will default to the last stage. Must be in the
|
|
89
|
+
same order as defined in the `stage_names` attribute.
|
|
90
|
+
apply_layernorm (`bool`, *optional*, defaults to `True`):
|
|
91
|
+
Whether to apply layer normalization to the feature maps in case the model is used as backbone.
|
|
92
|
+
reshape_hidden_states (`bool`, *optional*, defaults to `True`):
|
|
93
|
+
Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
|
|
94
|
+
case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
|
|
95
|
+
seq_len, hidden_size)`.
|
|
96
|
+
|
|
97
|
+
Example:
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
>>> from transformers import PixioConfig, PixioModel
|
|
101
|
+
|
|
102
|
+
>>> # Initializing a Pixio pixio-huge style configuration
|
|
103
|
+
>>> configuration = PixioConfig()
|
|
104
|
+
|
|
105
|
+
>>> # Initializing a model (with random weights) from the pixio-huge style configuration
|
|
106
|
+
>>> model = PixioModel(configuration)
|
|
107
|
+
|
|
108
|
+
>>> # Accessing the model configuration
|
|
109
|
+
>>> configuration = model.config
|
|
110
|
+
```"""
+
+    model_type = "pixio"
+
+    def __init__(
+        self,
+        hidden_size=1280,
+        num_hidden_layers=32,
+        num_attention_heads=16,
+        mlp_ratio=4,
+        n_cls_tokens=8,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-6,
+        image_size=256,
+        patch_size=16,
+        num_channels=3,
+        qkv_bias=True,
+        drop_path_rate=0.0,
+        out_features=None,
+        out_indices=None,
+        apply_layernorm=True,
+        reshape_hidden_states=True,
+        **kwargs,
+    ):
+        super().__init__(
+            hidden_size=hidden_size,
+            num_hidden_layers=num_hidden_layers,
+            num_attention_heads=num_attention_heads,
+            mlp_ratio=mlp_ratio,
+            hidden_act=hidden_act,
+            hidden_dropout_prob=hidden_dropout_prob,
+            attention_probs_dropout_prob=attention_probs_dropout_prob,
+            initializer_range=initializer_range,
+            layer_norm_eps=layer_norm_eps,
+            image_size=image_size,
+            patch_size=patch_size,
+            num_channels=num_channels,
+            qkv_bias=qkv_bias,
+            drop_path_rate=drop_path_rate,
+            apply_layernorm=apply_layernorm,
+            reshape_hidden_states=reshape_hidden_states,
+        )
+
+        self.n_cls_tokens = n_cls_tokens
+
+        del self.layerscale_value
+        del self.use_swiglu_ffn
+        del self.use_mask_token
+
+
+class PixioPatchEmbeddings(ViTPatchEmbeddings):
+    pass
+
+
+class PixioEmbeddings(nn.Module):
+    """
+    Construct the CLS tokens, position and patch embeddings.
+    """
+
+    def __init__(self, config: PixioConfig) -> None:
+        super().__init__()
+
+        self.cls_token = nn.Parameter(torch.randn(1, config.n_cls_tokens, config.hidden_size))
+        self.mask_token = None
+        self.patch_embeddings = PixioPatchEmbeddings(config)
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + config.n_cls_tokens, config.hidden_size))
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.n_cls_tokens = config.n_cls_tokens
+        self.patch_size = config.patch_size
+        self.config = config
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        Interpolates the pre-trained position encodings so the model can be used on higher-resolution images. Also
+        supports tracing, and performs the interpolation at torch.float32 precision.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+        num_patches = embeddings.shape[1] - self.n_cls_tokens
+        num_positions = self.position_embeddings.shape[1] - self.n_cls_tokens
+
+        if not is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings
+
+        class_pos_embed = self.position_embeddings[:, : self.n_cls_tokens]
+        patch_pos_embed = self.position_embeddings[:, self.n_cls_tokens :]
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size
+        new_width = width // self.patch_size
+
+        sqrt_num_positions = int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+        target_dtype = patch_pos_embed.dtype
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed.to(torch.float32),
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        ).to(dtype=target_dtype)
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape
+        target_dtype = self.patch_embeddings.projection.weight.dtype
+        embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
+
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+        embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+class PixioAttention(ViTAttention):
+    pass
+
+
+class PixioDropPath(Dinov2DropPath):
+    pass
+
+
+class PixioMLP(Dinov2MLP):
+    pass
+
+
+class PixioLayer(GradientCheckpointingLayer):
+    def __init__(self, config: PixioConfig) -> None:
+        super().__init__()
+
+        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.attention = PixioAttention(config)
+        self.drop_path = PixioDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
+
+        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.mlp = PixioMLP(config)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states_norm = self.norm1(hidden_states)
+        self_attention_output = self.attention(hidden_states_norm)
+
+        hidden_states = self.drop_path(self_attention_output) + hidden_states
+
+        layer_output = self.norm2(hidden_states)
+        layer_output = self.mlp(layer_output)
+
+        layer_output = self.drop_path(layer_output) + hidden_states
+
+        return layer_output
+
+
+class PixioEncoder(nn.Module):
+    def __init__(self, config: PixioConfig):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([PixioLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(self, hidden_states: torch.Tensor, output_hidden_states: bool = False) -> BaseModelOutput:
+        all_hidden_states = [hidden_states] if output_hidden_states else None
+        for i, layer_module in enumerate(self.layer):
+            hidden_states = layer_module(hidden_states)
+            if all_hidden_states:
+                all_hidden_states.append(hidden_states)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=tuple(all_hidden_states) if all_hidden_states else None,
+        )
+
+
+class PixioPreTrainedModel(ViTPreTrainedModel):
+    pass
+
+
+@auto_docstring
+class PixioModel(PixioPreTrainedModel):
+    def __init__(self, config: PixioConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = PixioEmbeddings(config)
+        self.encoder = PixioEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.post_init()
+
+    def get_input_embeddings(self) -> PixioPatchEmbeddings:
+        return self.embeddings.patch_embeddings
+
+    @check_model_inputs(tie_last_hidden_states=False)
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs,
+    ) -> BaseModelOutputWithPooling:
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        embedding_output = self.embeddings(pixel_values)
+
+        encoder_outputs: BaseModelOutput = self.encoder(embedding_output, output_hidden_states=output_hidden_states)
+        sequence_output = encoder_outputs.last_hidden_state
+        sequence_output = self.layernorm(sequence_output)
+        pooled_output = sequence_output[:, : self.embeddings.n_cls_tokens, :].mean(dim=1)
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Pixio backbone, to be used with frameworks like DETR and MaskFormer.
+    """
+)
+class PixioBackbone(Dinov2Backbone):
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self, pixel_values: torch.Tensor, output_hidden_states: Optional[bool] = None, **kwargs
+    ) -> BackboneOutput:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoBackbone
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("facebook/pixio-huge")
+        >>> model = AutoBackbone.from_pretrained(
+        ...     "facebook/pixio-huge", out_features=["stage7", "stage15", "stage23", "stage31"]
+        ... )
+
+        >>> inputs = processor(image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 1280, 16, 16]
+        ```"""
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+
+        embedding_output = self.embeddings(pixel_values)
+        output: BaseModelOutput = self.encoder(embedding_output, output_hidden_states=True)
+        hidden_states = output.hidden_states
+
+        feature_maps = []
+        for stage, hidden_state in zip(self.stage_names, hidden_states):
+            if stage in self.out_features:
+                if self.config.apply_layernorm:
+                    hidden_state = self.layernorm(hidden_state)
+                if self.config.reshape_hidden_states:
+                    hidden_state = hidden_state[:, self.embeddings.n_cls_tokens :]
+                    batch_size, _, height, width = pixel_values.shape
+                    patch_size = self.config.patch_size
+                    hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
+                    hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+                feature_maps.append(hidden_state)
+
+        return BackboneOutput(
+            feature_maps=tuple(feature_maps),
+            hidden_states=hidden_states if output_hidden_states else None,
+        )
+
+
+__all__ = ["PixioConfig", "PixioModel", "PixioPreTrainedModel", "PixioBackbone"]
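Worth noting in the `PixioModel.forward` added above: unlike a single-`[CLS]` ViT, Pixio keeps `n_cls_tokens` class tokens and mean-pools them to build `pooler_output`. A minimal sketch of that pooling step, with hypothetical tensor shapes:

```python
import torch

# Hypothetical shapes: batch of 2, 8 CLS tokens followed by 256 patch tokens, hidden size 1280.
n_cls_tokens = 8
sequence_output = torch.randn(2, n_cls_tokens + 256, 1280)

# Mirrors `sequence_output[:, : self.embeddings.n_cls_tokens, :].mean(dim=1)` above.
pooled_output = sequence_output[:, :n_cls_tokens, :].mean(dim=1)
print(pooled_output.shape)  # torch.Size([2, 1280])
```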
@@ -28,6 +28,7 @@ from ...modeling_rope_utils import dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import auto_docstring, can_return_tuple, logging
+from ...utils.generic import maybe_autocast
 from .configuration_pixtral import PixtralVisionConfig
 
 
@@ -73,7 +74,7 @@ class PixtralRotaryEmbedding(nn.Module):
 
         inv_freq, attention_scaling = rope_init_fn(self.config, device)
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
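The hunk above swaps a plain attribute for a registered buffer. The practical difference: buffers follow `module.to(device)` and dtype casts, while `persistent=False` keeps them out of the checkpoint. A minimal sketch of the behavior this change relies on, using a toy module:

```python
import torch
from torch import nn


class ToyRope(nn.Module):
    def __init__(self):
        super().__init__()
        inv_freq = 1.0 / (10000.0 ** (torch.arange(0, 8, 2).float() / 8))
        # Non-persistent buffers move with the module but are excluded from state_dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)


module = ToyRope()
print("original_inv_freq" in module.state_dict())  # False
```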
@@ -125,7 +126,7 @@ class PixtralRotaryEmbedding(nn.Module):
     def forward(self, x, position_ids):
         freqs = self.inv_freq[position_ids]
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             emb = freqs
             cos = emb.cos()
             sin = emb.sin()
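`maybe_autocast` is the `transformers.utils.generic` helper imported in the first pixtral hunk; with `enabled=False` it plays the same role as the `torch.autocast(..., enabled=False)` it replaces, keeping `cos`/`sin` in float32 even when the surrounding forward pass runs under autocast. A sketch of the underlying effect with plain `torch.autocast` (the helper's exact handling of unsupported devices is the part being abstracted away):

```python
import torch

x = torch.randn(4, dtype=torch.float32)

with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    # Disabling autocast inside the region keeps the trig ops in full float32 precision.
    with torch.autocast(device_type="cpu", enabled=False):
        cos, sin = x.cos(), x.sin()

print(cos.dtype, sin.dtype)  # torch.float32 torch.float32
```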
@@ -150,7 +150,7 @@ class PixtralProcessor(ProcessorMixin):
 
         output_kwargs = self._merge_kwargs(
             PixtralProcessorKwargs,
-            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            tokenizer_init_kwargs=getattr(self.tokenizer, "init_kwargs", {}),
             **kwargs,
         )
 
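The `getattr` with a `{}` fallback makes the processor tolerate tokenizer backends that never define `init_kwargs` (the MistralCommonBackend handled in the next hunk is presumably such a case). The pattern in isolation:

```python
class MinimalTokenizer:
    """Hypothetical backend that does not expose init_kwargs."""


tokenizer = MinimalTokenizer()

# The old form raised AttributeError for such backends; the new one degrades to an empty dict.
init_kwargs = getattr(tokenizer, "init_kwargs", {})
print(init_kwargs)  # {}
```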
@@ -197,6 +197,8 @@ class PixtralProcessor(ProcessorMixin):
 
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
         return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        # Remove return_token_type_ids as MistralCommonBackend doesn't support it
+        output_kwargs["text_kwargs"].pop("return_token_type_ids", None)
         text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"], return_tensors=None)
         self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
 
@@ -151,6 +151,7 @@ class PLBartConfig(PreTrainedConfig):
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
@@ -27,6 +27,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -73,6 +74,11 @@ class PLBartPreTrainedModel(PreTrainedModel):
     _supports_sdpa = True
     _supports_flex_attn = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, PLBartForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)
+
 
 class PLBartLearnedPositionalEmbedding(nn.Embedding):
     """
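The new `_init_weights` hook zeroes `final_logits_bias` during weight initialization rather than relying on how the tensor happened to be constructed. A minimal sketch of the hook pattern, substituting `torch.nn.init.zeros_` for the `transformers.initialization` helper and a toy module for the real model:

```python
import torch
from torch import nn


class ToyLMHead(nn.Module):
    def __init__(self):
        super().__init__()
        # In PLBart, final_logits_bias is a buffer shaped (1, vocab_size).
        self.register_buffer("final_logits_bias", torch.randn(1, 10))


def _init_weights(module):
    # Only touch the module type the hook targets, as in the hunk above.
    if isinstance(module, ToyLMHead):
        nn.init.zeros_(module.final_logits_bias)


model = ToyLMHead()
model.apply(_init_weights)
print(model.final_logits_bias.abs().sum().item())  # 0.0
```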
@@ -366,6 +372,7 @@ class PLBartEncoder(PLBartPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         Args:
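This `**kwargs,` addition repeats across the PLBart, PoolFormer and Pop2Piano hunks that follow: each `forward` gains a catch-all so call sites that pass newer keyword arguments do not hit a `TypeError` on these older signatures. The mechanics, reduced to a toy function:

```python
def forward_old(input_ids, attention_mask=None):
    return input_ids


def forward_new(input_ids, attention_mask=None, **kwargs):
    # Unknown keyword arguments are accepted and ignored instead of raising TypeError.
    return input_ids


forward_new([1, 2, 3], some_new_flag=True)  # fine
# forward_old([1, 2, 3], some_new_flag=True)  # TypeError: unexpected keyword argument
```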
@@ -621,6 +628,7 @@ class PLBartDecoder(PLBartPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         r"""
         Args:
@@ -867,6 +875,7 @@ class PLBartModel(PLBartPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1002,6 +1011,7 @@ class PLBartForConditionalGeneration(PLBartPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1159,6 +1169,7 @@ class PLBartForSequenceClassification(PLBartPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqSequenceClassifierOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1268,6 +1279,7 @@ class PLBartDecoderWrapper(PLBartPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.decoder = PLBartDecoder(config)
+        self.post_init()
 
     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)
@@ -1316,6 +1328,7 @@ class PLBartForCausalLM(PLBartPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -21,6 +21,7 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss
 
+from ... import initialization as init
 from ...cache_utils import Cache
 from ...generation import GenerationMixin
 from ...modeling_outputs import (
@@ -56,6 +57,11 @@ class PLBartPreTrainedModel(PreTrainedModel):
     _supports_sdpa = True
     _supports_flex_attn = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, PLBartForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)
+
 
 class PLBartEncoder(BartEncoder):
     pass
@@ -108,6 +114,7 @@ class PLBartModel(PLBartPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -243,6 +250,7 @@ class PLBartForConditionalGeneration(PLBartPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -125,7 +125,6 @@ class PLBartTokenizer(SentencePieceBackend):
         pad_token="<pad>",
         mask_token="<mask>",
         language_codes="base",
-        tokenizer_file=None,
         src_lang=None,
         tgt_lang=None,
         sp_model_kwargs: Optional[dict[str, Any]] = None,
@@ -171,7 +170,6 @@ class PLBartTokenizer(SentencePieceBackend):
             cls_token=cls_token,
             pad_token=pad_token,
             mask_token=mask_token,
-            tokenizer_file=tokenizer_file,
             src_lang=src_lang,
             tgt_lang=tgt_lang,
             additional_special_tokens=_additional_special_tokens,
@@ -231,7 +231,6 @@ class PoolFormerImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
 
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
 
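The manual `torch.stack` could be dropped because `BatchFeature` already converts list-valued data once `tensor_type` is set, which is what the unchanged `return` line relies on. A sketch, assuming equally shaped per-image tensors as produced by the grouped processing above:

```python
import torch
from transformers import BatchFeature

images = [torch.zeros(3, 4, 4), torch.ones(3, 4, 4)]

# tensor_type="pt" makes BatchFeature collate the list into one batched tensor.
features = BatchFeature(data={"pixel_values": images}, tensor_type="pt")
print(features["pixel_values"].shape)  # torch.Size([2, 3, 4, 4])
```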
@@ -268,7 +268,11 @@ class PoolFormerModel(PoolFormerPreTrainedModel):
         self.post_init()
 
     def get_input_embeddings(self):
-        return self.embeddings.patch_embeddings
+        # Input embeddings correspond to the very first patch-embedding stage.
+        return self.encoder.patch_embeddings[0]
+
+    def set_input_embeddings(self, value):
+        self.encoder.patch_embeddings[0] = value
 
     @auto_docstring
     def forward(
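PoolFormer has no standalone embeddings module, so the input embeddings are identified with the first patch-embedding stage inside the encoder, and `set_input_embeddings` writes back through the same path. The delegation shape of it, with toy modules:

```python
from torch import nn


class ToyEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        # The first entry plays the role of PoolFormer's initial patch embedding.
        self.patch_embeddings = nn.ModuleList([nn.Conv2d(3, 8, 4), nn.Conv2d(8, 16, 2)])


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = ToyEncoder()

    def get_input_embeddings(self):
        return self.encoder.patch_embeddings[0]

    def set_input_embeddings(self, value):
        self.encoder.patch_embeddings[0] = value


model = ToyModel()
model.set_input_embeddings(nn.Conv2d(3, 8, 7))
print(model.get_input_embeddings().kernel_size)  # (7, 7)
```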
@@ -276,6 +280,7 @@ class PoolFormerModel(PoolFormerPreTrainedModel):
         pixel_values: Optional[torch.FloatTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithNoAttention]:
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -332,6 +337,12 @@ class PoolFormerForImageClassification(PoolFormerPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
+    def get_input_embeddings(self):
+        return self.poolformer.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.poolformer.set_input_embeddings(value)
+
     @auto_docstring
     def forward(
         self,
@@ -339,6 +350,7 @@ class PoolFormerForImageClassification(PoolFormerPreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -644,6 +644,7 @@ class Pop2PianoStack(Pop2PianoPreTrainedModel):
         output_hidden_states=None,
         return_dict=None,
         cache_position=None,
+        **kwargs,
     ):
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1051,6 +1052,7 @@ class Pop2PianoForConditionalGeneration(Pop2PianoPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -37,9 +37,8 @@ class PromptDepthAnythingConfig(PreTrainedConfig):
     documentation from [`PreTrainedConfig`] for more information.
 
     Args:
-        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*):
-            The configuration of the backbone model. Only used in case `is_hybrid` is `True` or in case you want to
-            leverage the [`AutoBackbone`] API.
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `Dinov2Config()`):
+            The configuration of the backbone model.
         backbone (`str`, *optional*):
             Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
             will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`