transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -522,6 +522,14 @@ class BrosPreTrainedModel(PreTrainedModel):
         std = self.config.initializer_range
         if isinstance(module, BrosRelationExtractor):
             init.normal_(module.dummy_node, std=std)
+        elif isinstance(module, BrosTextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
+        elif isinstance(module, BrosPositionalEmbedding1D):
+            inv_freq = 1 / (
+                10000 ** (torch.arange(0.0, module.dim_bbox_sinusoid_emb_1d, 2.0) / module.dim_bbox_sinusoid_emb_1d)
+            )
+            init.copy_(module.inv_freq, inv_freq)


 @auto_docstring
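
The new `_init_weights` branches above move buffer setup into the init path. As a rough illustration (not part of the diff), the `BrosPositionalEmbedding1D` branch fills `inv_freq` with the usual sinusoidal inverse frequencies; a standalone sketch with an assumed embedding width:

```python
import torch

# Assumed width for illustration only; the real value is module.dim_bbox_sinusoid_emb_1d from the Bros config.
dim_bbox_sinusoid_emb_1d = 64

# Same expression as the added `_init_weights` branch: one inverse frequency per pair of channels.
inv_freq = 1 / (
    10000 ** (torch.arange(0.0, dim_bbox_sinusoid_emb_1d, 2.0) / dim_bbox_sinusoid_emb_1d)
)
print(inv_freq.shape, inv_freq[0].item(), inv_freq[-1].item())  # 32 values decaying from 1.0 toward ~1e-4
```
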
@@ -563,6 +571,7 @@ class BrosModel(BrosPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
         r"""
         bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'):
@@ -701,6 +710,7 @@ class BrosForTokenClassification(BrosPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
         r"""
         bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'):
@@ -821,6 +831,7 @@ class BrosSpadeEEForTokenClassification(BrosPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BrosSpadeOutput]:
         r"""
         bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'):
@@ -957,6 +968,7 @@ class BrosSpadeELForTokenClassification(BrosPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
         r"""
         bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'):
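
The remaining Bros hunks only widen the `forward` signatures with `**kwargs`, so extra keyword arguments (for example, flags threaded through by newer shared attention/generation code) no longer raise a `TypeError`. A toy sketch of the signature-level effect, not the real `BrosModel.forward`:

```python
def forward_old(input_ids, bbox, return_dict=None):
    return input_ids

def forward_new(input_ids, bbox, return_dict=None, **kwargs):
    # Unknown keyword arguments are simply accepted here.
    return input_ids

forward_new([101], [[0, 0, 1, 1]], some_new_flag=True)       # accepted
try:
    forward_old([101], [[0, 0, 1, 1]], some_new_flag=True)
except TypeError as err:
    print(err)                                               # unexpected keyword argument 'some_new_flag'
```
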
@@ -54,6 +54,112 @@ from .configuration_camembert import CamembertConfig
|
|
|
54
54
|
logger = logging.get_logger(__name__)
|
|
55
55
|
|
|
56
56
|
|
|
57
|
+
class CamembertEmbeddings(nn.Module):
|
|
58
|
+
"""Construct the embeddings from word, position and token_type embeddings."""
|
|
59
|
+
|
|
60
|
+
def __init__(self, config):
|
|
61
|
+
super().__init__()
|
|
62
|
+
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
|
|
63
|
+
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
|
|
64
|
+
|
|
65
|
+
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
|
66
|
+
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
+
+        self.padding_idx = config.pad_token_id
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
+        if position_ids is None:
+            if input_ids is not None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = self.create_position_ids_from_input_ids(
+                    input_ids, self.padding_idx, past_key_values_length
+                )
+            else:
+                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, self.padding_idx)
+
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        batch_size, seq_length = input_shape
+
+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0])
+                buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1)
+                buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids)
+                token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = inputs_embeds + token_type_embeddings
+
+        position_embeddings = self.position_embeddings(position_ids)
+        embeddings = embeddings + position_embeddings
+
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+    @staticmethod
+    def create_position_ids_from_inputs_embeds(inputs_embeds, padding_idx):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(
+            padding_idx + 1, sequence_length + padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape)
+
+    @staticmethod
+    def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+        """
+        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+        are ignored. This is modified from fairseq's `utils.make_positions`.
+
+        Args:
+            x: torch.Tensor x:
+
+        Returns: torch.Tensor
+        """
+        # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+        mask = input_ids.ne(padding_idx).int()
+        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+        return incremental_indices.long() + padding_idx
+
+
 def eager_attention_forward(
     module: nn.Module,
     query: torch.Tensor,
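The added `create_position_ids_from_input_ids` keeps padded slots at `padding_idx` and numbers real tokens starting at `padding_idx + 1`. A minimal standalone check of the same mask/cumsum arithmetic (just the helper from the diff, exercised on a toy padded batch):

```python
import torch

def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    # Non-padding tokens get 1, padding tokens get 0.
    mask = input_ids.ne(padding_idx).int()
    # Cumulative count of real tokens gives 1-based positions; padding stays 0 after the multiply.
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    # Shift so real positions start at padding_idx + 1 and padding lands exactly on padding_idx.
    return incremental_indices.long() + padding_idx

input_ids = torch.tensor([[5, 6, 7, 1, 1]])  # 1 plays the role of the pad token id here
print(create_position_ids_from_input_ids(input_ids, padding_idx=1))
# tensor([[2, 3, 4, 1, 1]])
```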
@@ -417,112 +523,9 @@ class CamembertPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, CamembertLMHead):
             init.zeros_(module.bias)
-
-
-
-    """Construct the embeddings from word, position and token_type embeddings."""
-
-    def __init__(self, config):
-        super().__init__()
-        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
-        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
-
-        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer(
-            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
-        )
-        self.register_buffer(
-            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
-        )
-
-        self.padding_idx = config.pad_token_id
-        self.position_embeddings = nn.Embedding(
-            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
-        )
-
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        past_key_values_length: int = 0,
-    ) -> torch.Tensor:
-        if position_ids is None:
-            if input_ids is not None:
-                # Create the position ids from the input token ids. Any padded tokens remain padded.
-                position_ids = self.create_position_ids_from_input_ids(
-                    input_ids, self.padding_idx, past_key_values_length
-                )
-            else:
-                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, self.padding_idx)
-
-        if input_ids is not None:
-            input_shape = input_ids.size()
-        else:
-            input_shape = inputs_embeds.size()[:-1]
-
-        batch_size, seq_length = input_shape
-
-        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
-        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
-        # issue #5664
-        if token_type_ids is None:
-            if hasattr(self, "token_type_ids"):
-                # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0])
-                buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1)
-                buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids)
-                token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
-            else:
-                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
-
-        if inputs_embeds is None:
-            inputs_embeds = self.word_embeddings(input_ids)
-        token_type_embeddings = self.token_type_embeddings(token_type_ids)
-        embeddings = inputs_embeds + token_type_embeddings
-
-        position_embeddings = self.position_embeddings(position_ids)
-        embeddings = embeddings + position_embeddings
-
-        embeddings = self.LayerNorm(embeddings)
-        embeddings = self.dropout(embeddings)
-        return embeddings
-
-    @staticmethod
-    def create_position_ids_from_inputs_embeds(inputs_embeds, padding_idx):
-        """
-        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
-
-        Args:
-            inputs_embeds: torch.Tensor
-
-        Returns: torch.Tensor
-        """
-        input_shape = inputs_embeds.size()[:-1]
-        sequence_length = input_shape[1]
-
-        position_ids = torch.arange(
-            padding_idx + 1, sequence_length + padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
-        )
-        return position_ids.unsqueeze(0).expand(input_shape)
-
-    @staticmethod
-    def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
-        """
-        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
-        are ignored. This is modified from fairseq's `utils.make_positions`.
-
-        Args:
-            x: torch.Tensor x:
-
-        Returns: torch.Tensor
-        """
-        # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
-        mask = input_ids.ne(padding_idx).int()
-        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
-        return incremental_indices.long() + padding_idx
+        elif isinstance(module, CamembertEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
 
 class CamembertEncoder(nn.Module):
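The new `_init_weights` branch re-creates the `position_ids` and `token_type_ids` buffers instead of relying on the checkpoint; buffers registered with `persistent=False` are excluded from the state dict, so a fresh load has to rebuild them. A minimal sketch of that pattern using plain `torch` calls (the `init.copy_` / `init.zeros_` helpers used above are the library's; the toy module below is only illustrative):

```python
import torch
from torch import nn

class ToyEmbeddings(nn.Module):
    def __init__(self, max_position_embeddings: int = 8):
        super().__init__()
        # Non-persistent buffers: excluded from the state dict, so initialization must rebuild them.
        self.register_buffer("position_ids", torch.arange(max_position_embeddings).expand((1, -1)), persistent=False)
        self.register_buffer("token_type_ids", torch.zeros(1, max_position_embeddings, dtype=torch.long), persistent=False)

def init_weights(module: nn.Module) -> None:
    # Same idea as the `elif isinstance(module, CamembertEmbeddings)` branch in the hunk above.
    if isinstance(module, ToyEmbeddings):
        module.position_ids.copy_(torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        module.token_type_ids.zero_()

model = ToyEmbeddings()
model.apply(init_weights)
print(model.position_ids, model.token_type_ids.sum().item())
```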
@@ -14,6 +14,8 @@
 # limitations under the License
 """Tokenization classes for Camembert model."""
 
+from typing import Optional, Union
+
 from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
 from tokenizers.models import Unigram
 
@@ -83,7 +85,7 @@ class CamembertTokenizer(TokenizersBackend):
         vocab_file (`str`, *optional*):
             [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        vocab (`dict`, *optional*):
+        vocab (`str`, `dict` or `list`, *optional*):
             Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
     """
 
@@ -103,7 +105,7 @@ class CamembertTokenizer(TokenizersBackend):
         additional_special_tokens=None,
         add_prefix_space=True,
         vocab_file=None,
-        vocab=None,
+        vocab: Optional[Union[str, dict, list]] = None,
         **kwargs,
     ):
         self.vocab_file = vocab_file
@@ -114,9 +116,9 @@ class CamembertTokenizer(TokenizersBackend):
         if additional_special_tokens is None:
            additional_special_tokens = ["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"]
 
-        if vocab is not None:
-            self._vocab = vocab
-            unk_index = next(i for i, (tok, _) in enumerate(self._vocab) if tok == str(unk_token))
+        if vocab is not None:
+            self._vocab = vocab
+            unk_index = next((i for i, (tok, _) in enumerate(self._vocab) if tok == str(unk_token)), 0)
             self._tokenizer = Tokenizer(Unigram(self._vocab, unk_id=unk_index, byte_fallback=False))
         else:
             self._vocab = [
@@ -131,11 +133,8 @@ class CamembertTokenizer(TokenizersBackend):
 
         self._tokenizer.normalizer = normalizers.Sequence(
             [
-                normalizers.Replace("\n", " "),
-                normalizers.Replace("\r", " "),
-                normalizers.Replace("\t", " "),
+                normalizers.Replace(Regex(r"\s{2,}|[\n\r\t]"), " "),
                 normalizers.Strip(left=False, right=True),
-                normalizers.Replace(Regex(" {2,}"), "▁"),
             ]
         )
 
@@ -143,10 +142,7 @@ class CamembertTokenizer(TokenizersBackend):
         self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)
         self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)
 
-        tokenizer_object = self._tokenizer
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             bos_token=bos_token,
             eos_token=eos_token,
             sep_token=sep_token,
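The normalizer change collapses the three per-character `Replace` rules (and drops the later space-run-to-`▁` replacement) into a single regex that maps whitespace runs and any newline/carriage-return/tab to one space. A small standalone check of that normalizer chain with the `tokenizers` library, outside the tokenizer class:

```python
from tokenizers import Regex, normalizers

normalizer = normalizers.Sequence(
    [
        normalizers.Replace(Regex(r"\s{2,}|[\n\r\t]"), " "),
        normalizers.Strip(left=False, right=True),
    ]
)

# Runs of whitespace and control characters become single spaces; trailing whitespace is stripped.
print(repr(normalizer.normalize_str("Le camembert\t est \n\n délicieux  ")))
```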
@@ -23,6 +23,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import (
@@ -719,6 +720,11 @@ class CaninePreTrainedModel(PreTrainedModel):
     base_model_prefix = "canine"
     supports_gradient_checkpointing = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, CanineEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+
 
 @auto_docstring
 class CanineModel(CaninePreTrainedModel):
@@ -836,6 +842,7 @@ class CanineModel(CaninePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, CanineModelOutputWithPooling]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1006,6 +1013,7 @@ class CanineForSequenceClassification(CaninePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1089,6 +1097,7 @@ class CanineForMultipleChoice(CaninePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MultipleChoiceModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -1192,6 +1201,7 @@ class CanineForTokenClassification(CaninePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1287,6 +1297,7 @@ class CanineForQuestionAnswering(CaninePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
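Many forward signatures in this section (and in the ChineseCLIP and CLAP hunks further below) gain a trailing `**kwargs`. One plausible reading is that callers and wrappers can then forward a shared keyword dictionary without triggering a `TypeError` for keys a given head does not use; the toy module below is purely illustrative, not the library's class:

```python
from typing import Optional

import torch
from torch import nn

class ToyHead(nn.Module):
    # Accepting **kwargs lets callers pass extra, possibly unused keywords without a TypeError.
    def forward(self, hidden_states: torch.Tensor, return_dict: Optional[bool] = None, **kwargs) -> torch.Tensor:
        return hidden_states.mean(dim=1)

head = ToyHead()
shared_kwargs = {"return_dict": True, "output_attentions": False}  # the extra key is simply ignored
print(head(torch.randn(2, 4, 8), **shared_kwargs).shape)
```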
@@ -38,6 +38,7 @@ from ...utils import (
     can_return_tuple,
     logging,
 )
+from ...utils.generic import maybe_autocast
 from .configuration_chameleon import ChameleonConfig, ChameleonVQVAEConfig
 
 
@@ -83,7 +84,7 @@ class ChameleonRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -122,7 +123,7 @@ class ChameleonRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
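The rotary-embedding hunk keeps the cos/sin computation in float32 (autocast disabled, per the `# Force float32` comment) and now stores `original_inv_freq` as a non-persistent buffer. A small sketch of the underlying frequency/angle computation, independent of the Chameleon class and using default RoPE parameters only:

```python
import torch

def rope_cos_sin(head_dim: int, position_ids: torch.Tensor, base: float = 10000.0):
    # Default RoPE inverse frequencies: one per pair of channels.
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim))
    # Outer product of positions and frequencies, kept in float32 for numerical stability.
    freqs = position_ids.to(torch.float32)[:, :, None] * inv_freq[None, None, :]
    emb = torch.cat((freqs, freqs), dim=-1)
    return emb.cos(), emb.sin()

cos, sin = rope_cos_sin(head_dim=8, position_ids=torch.arange(4)[None, :])
print(cos.shape, sin.shape)  # torch.Size([1, 4, 8]) torch.Size([1, 4, 8])
```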
@@ -808,6 +809,7 @@ class ChameleonVQVAE(ChameleonPreTrainedModel):
         self.quant_conv = torch.nn.Conv2d(config.latent_channels, config.embed_dim, 1)
         self.post_quant_conv = torch.nn.Conv2d(config.embed_dim, config.latent_channels, 1)
         self.eval()  # Chameleon's VQ model is frozen
+        self.post_init()
 
     def encode(self, pixel_values: torch.LongTensor):
         hidden_states = self.encoder(pixel_values)
@@ -1121,6 +1123,7 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixin):
         cache_position=None,
         position_ids=None,
         use_cache=True,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1134,12 +1137,15 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixin):
             cache_position=cache_position,
             position_ids=position_ids,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if …
-            # …
-            # …
+        if not is_first_iteration and use_cache:
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = None
 
         return model_inputs
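The `prepare_inputs_for_generation` change drops `pixel_values` after the first iteration: per the added comments, once the image features are merged with the text and cached, forwarding them again is redundant. A toy sketch of that gating logic with placeholder values (names are illustrative, not the model's API):

```python
def prepare_inputs(pixel_values, is_first_iteration: bool, use_cache: bool) -> dict:
    model_inputs = {"pixel_values": pixel_values}
    # After the first iteration with a cache, image features already live in the cache,
    # so the pixel values are dropped from subsequent steps.
    if not is_first_iteration and use_cache:
        model_inputs["pixel_values"] = None
    return model_inputs

print(prepare_inputs("<pixels>", is_first_iteration=True, use_cache=True))   # keeps pixel values
print(prepare_inputs("<pixels>", is_first_iteration=False, use_cache=True))  # drops them
```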
@@ -572,10 +572,13 @@ class ChineseCLIPPreTrainedModel(PreTrainedModel):
             init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
             init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
             init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+            init.copy_(module.position_ids, torch.arange(module.num_positions).expand((1, -1)))
         elif isinstance(module, ChineseCLIPTextEmbeddings):
             init.normal_(module.word_embeddings.weight, mean=0.0, std=self.config.initializer_range)
             init.normal_(module.position_embeddings.weight, mean=0.0, std=self.config.initializer_range)
             init.normal_(module.token_type_embeddings.weight, mean=0.0, std=self.config.initializer_range)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
             for embedding in [module.word_embeddings, module.position_embeddings, module.token_type_embeddings]:
                 if embedding.padding_idx is not None:
                     init.zeros_(embedding.weight[embedding.padding_idx])
@@ -638,9 +641,9 @@ class ChineseCLIPTextEncoder(nn.Module):
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
             layer_outputs = layer_module(
-                hidden_states
-                attention_mask
-                output_attentions
+                hidden_states,
+                attention_mask,
+                output_attentions,
                 **kwargs,
             )
 
@@ -839,6 +842,7 @@ class ChineseCLIPTextModel(ChineseCLIPPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPooling]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -926,6 +930,7 @@ class ChineseCLIPVisionModel(ChineseCLIPPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         Examples:
@@ -1091,6 +1096,7 @@ class ChineseCLIPModel(ChineseCLIPPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ChineseCLIPOutput]:
         r"""
         return_loss (`bool`, *optional*):
@@ -71,7 +71,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor):
             Truncation pattern for long audio inputs. Two patterns are available:
             - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a
               downsampled version of the entire mel spectrogram.
-              If `config.fusion` is set to True, shorter audios also need to
+              If `config.fusion` is set to True, shorter audios also need to return 4 mels, which will just be a copy
               of the original mel obtained from the padded audio.
             - `rand_trunc` will select a random crop of the mel spectrogram.
         padding (`str`, *optional*, defaults to `"repeatpad"`):
@@ -279,7 +279,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor):
                 Truncation pattern for long audio inputs. Two patterns are available:
                 - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and
                   a downsampled version of the entire mel spectrogram.
-                  If `config.fusion` is set to True, shorter audios also need to
+                  If `config.fusion` is set to True, shorter audios also need to return 4 mels, which will just be a
                   copy of the original mel obtained from the padded audio.
                 - `rand_trunc` will select a random crop of the mel spectrogram.
             padding (`str`, *optional*):
@@ -365,18 +365,7 @@ class ClapAudioSelfAttention(nn.Module):
             torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
         )
 
-
-        coords_h = torch.arange(self.window_size[0])
-        coords_w = torch.arange(self.window_size[1])
-        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
-        coords_flatten = torch.flatten(coords, 1)
-        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
-        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
-        relative_coords[:, :, 0] += self.window_size[0] - 1
-        relative_coords[:, :, 1] += self.window_size[1] - 1
-        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
-        relative_position_index = relative_coords.sum(-1)
-        self.register_buffer("relative_position_index", relative_position_index)
+        self.register_buffer("relative_position_index", self.create_relative_position_index())
 
         self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
         self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
@@ -435,6 +424,20 @@ class ClapAudioSelfAttention(nn.Module):
 
         return outputs
 
+    def create_relative_position_index(self):
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
+        coords_flatten = torch.flatten(coords, 1)
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
+        relative_coords[:, :, 0] += self.window_size[0] - 1
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)
+        return relative_position_index
+
 
 # Copied from transformers.models.swin.modeling_swin.SwinSelfOutput with Swin->ClapAudio
 class ClapAudioSelfOutput(nn.Module):
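`ClapAudioSelfAttention` now builds its Swin-style relative-position index in a dedicated `create_relative_position_index` method, which also lets weight initialization recompute the buffer (see the `init.copy_` call in the ClapPreTrainedModel hunk further below). A standalone version of the same computation for a small window, using `torch.meshgrid` directly instead of the module's `meshgrid` helper:

```python
import torch

def create_relative_position_index(window_size):
    # Pair-wise relative position index for each token inside an (h, w) attention window.
    coords_h = torch.arange(window_size[0])
    coords_w = torch.arange(window_size[1])
    coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing="ij"))
    coords_flatten = torch.flatten(coords, 1)
    relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
    relative_coords = relative_coords.permute(1, 2, 0).contiguous()
    # Shift both axes to start at 0, then fold them into a single flat index.
    relative_coords[:, :, 0] += window_size[0] - 1
    relative_coords[:, :, 1] += window_size[1] - 1
    relative_coords[:, :, 0] *= 2 * window_size[1] - 1
    return relative_coords.sum(-1)

index = create_relative_position_index((2, 2))
print(index.shape)  # torch.Size([4, 4]); values index a (2*2-1)*(2*2-1) = 9-entry bias table
print(index)
```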
@@ -1266,9 +1269,9 @@ class ClapTextEncoder(nn.Module):
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
             layer_outputs = layer_module(
-                hidden_states
-                attention_mask
-                output_attentions
+                hidden_states,
+                attention_mask,
+                output_attentions,
                 **kwargs,
             )
 
@@ -1317,6 +1320,8 @@ class ClapPreTrainedModel(PreTrainedModel):
         if isinstance(module, ClapTextEmbeddings):
             init.normal_(module.position_embeddings.weight, mean=0.0, std=factor * 0.02)
             init.normal_(module.token_type_embeddings.weight, mean=0.0, std=factor * 0.02)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
         elif isinstance(module, ClapModel):
             init.constant_(module.logit_scale_a, math.log(self.config.logit_scale_init_value))
             init.constant_(module.logit_scale_t, math.log(self.config.logit_scale_init_value))
@@ -1325,6 +1330,10 @@ class ClapPreTrainedModel(PreTrainedModel):
         elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, (nn.Conv2d, nn.Linear)):
             in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor
             init.normal_(module.weight, std=in_proj_std)
@@ -1332,6 +1341,7 @@ class ClapPreTrainedModel(PreTrainedModel):
             init.zeros_(module.bias)
         elif isinstance(module, ClapAudioSelfAttention):
             init.zeros_(module.relative_position_bias_table)
+            init.copy_(module.relative_position_index, module.create_relative_position_index())
 
 
 class ClapAudioModel(ClapPreTrainedModel):
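The LayerNorm/BatchNorm branch above now also resets the BatchNorm running statistics, which live in buffers rather than parameters, so a weight-and-bias-only init would leave them untouched. A quick standalone illustration with `nn.BatchNorm2d`:

```python
import torch
from torch import nn

bn = nn.BatchNorm2d(3)
bn.train()
bn(torch.randn(4, 3, 8, 8))  # one forward pass in train mode updates the running statistics

# Running stats are buffers, not parameters, so they need their own reset.
print([name for name, _ in bn.named_buffers()])  # includes 'running_mean', 'running_var', 'num_batches_tracked'
bn.running_mean.zero_()
bn.running_var.fill_(1.0)
bn.num_batches_tracked.zero_()
```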
@@ -1356,6 +1366,7 @@ class ClapAudioModel(ClapPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
@@ -1446,6 +1457,7 @@ class ClapTextModel(ClapPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1627,6 +1639,7 @@ class ClapModel(ClapPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ClapOutput]:
         r"""
         is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
@@ -1740,6 +1753,7 @@ class ClapTextModelWithProjection(ClapPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ClapTextModelOutput]:
         r"""
         Examples:
@@ -1803,6 +1817,7 @@ class ClapAudioModelWithProjection(ClapPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ClapAudioModelOutput]:
         r"""
         is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
@@ -416,11 +416,13 @@ class CLIPPreTrainedModel(PreTrainedModel):
         if isinstance(module, CLIPTextEmbeddings):
             init.normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02)
             init.normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, CLIPVisionEmbeddings):
             factor = self.config.initializer_factor
             init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
             init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
             init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+            init.copy_(module.position_ids, torch.arange(module.num_positions).expand((1, -1)))
         elif isinstance(module, CLIPAttention):
             factor = self.config.initializer_factor
             in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor