transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff compares the published contents of two package versions as released to a supported public registry; it is provided for informational purposes only.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/big_bird/modeling_big_bird.py

@@ -1521,6 +1521,9 @@ class BigBirdPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, BigBirdLMPredictionHead):
             init.zeros_(module.bias)
+        elif isinstance(module, BigBirdEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
 
 @dataclass
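The new `elif` branch initializes registered *buffers* (`position_ids`, `token_type_ids`), not just parameters. A minimal pure-torch sketch of the same pattern, where the hypothetical `ToyEmbeddings` stands in for `BigBirdEmbeddings` and plain tensor ops stand in for the `init.copy_`/`init.zeros_` helpers from the new `transformers/initialization.py`:

import torch
from torch import nn

class ToyEmbeddings(nn.Module):
    """Hypothetical stand-in for BigBirdEmbeddings: buffers that need explicit values."""

    def __init__(self, max_positions: int = 8):
        super().__init__()
        self.register_buffer("position_ids", torch.empty(1, max_positions, dtype=torch.long))
        self.register_buffer("token_type_ids", torch.empty(1, max_positions, dtype=torch.long))

def init_buffers(module: nn.Module) -> None:
    # Same effect as the init.copy_ / init.zeros_ calls in the hunk above, in plain torch.
    if isinstance(module, ToyEmbeddings):
        module.position_ids.copy_(torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        module.token_type_ids.zero_()

model = ToyEmbeddings()
model.apply(init_buffers)
print(model.position_ids)    # tensor([[0, 1, 2, 3, 4, 5, 6, 7]])
print(model.token_type_ids)  # tensor([[0, 0, 0, 0, 0, 0, 0, 0]])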
@@ -1918,6 +1921,7 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[BigBirdForPreTrainingOutput, tuple[torch.FloatTensor]]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -2028,6 +2032,7 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[MaskedLMOutput, tuple[torch.FloatTensor]]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -2277,6 +2282,7 @@ class BigBirdForSequenceClassification(BigBirdPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[SequenceClassifierOutput, tuple[torch.FloatTensor]]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -2394,6 +2400,7 @@ class BigBirdForMultipleChoice(BigBirdPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[MultipleChoiceModelOutput, tuple[torch.FloatTensor]]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -2500,6 +2507,7 @@ class BigBirdForTokenClassification(BigBirdPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[TokenClassifierOutput, tuple[torch.FloatTensor]]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -2591,6 +2599,7 @@ class BigBirdForQuestionAnswering(BigBirdPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[BigBirdForQuestionAnsweringModelOutput, tuple[torch.FloatTensor]]:
         r"""
         question_lengths (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
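Each of the six task heads above gains a bare `**kwargs` in `forward`. A hypothetical sketch (not the transformers API) of why tolerant signatures matter: callers that pass newer flags through shared plumbing no longer hit a TypeError:

# Hypothetical sketch: strict vs. **kwargs-tolerant forward signatures.
def forward_strict(input_ids, return_dict=None):
    return input_ids

def forward_tolerant(input_ids, return_dict=None, **kwargs):
    # Unknown keyword arguments (e.g. flags added by newer shared plumbing)
    # are absorbed here instead of raising.
    return input_ids

forward_tolerant([1, 2, 3], some_new_flag=True)    # OK: extra kwarg is absorbed
try:
    forward_strict([1, 2, 3], some_new_flag=True)  # raises
except TypeError as exc:
    print(exc)  # unexpected keyword argument 'some_new_flag'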
@@ -14,6 +14,8 @@
|
|
|
14
14
|
# limitations under the License.
|
|
15
15
|
"""Tokenization classes for Big Bird model."""
|
|
16
16
|
|
|
17
|
+
from typing import Optional, Union
|
|
18
|
+
|
|
17
19
|
from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
|
|
18
20
|
from tokenizers.models import Unigram
|
|
19
21
|
|
|
@@ -37,7 +39,7 @@ class BigBirdTokenizer(TokenizersBackend):
|
|
|
37
39
|
this superclass for more information regarding those methods
|
|
38
40
|
|
|
39
41
|
Args:
|
|
40
|
-
vocab (`dict`, *optional*):
|
|
42
|
+
vocab (`str`, `dict` or `list`, *optional*):
|
|
41
43
|
Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
|
|
42
44
|
unk_token (`str`, *optional*, defaults to `"<unk>"`):
|
|
43
45
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
|
@@ -80,10 +82,11 @@ class BigBirdTokenizer(TokenizersBackend):
|
|
|
80
82
|
vocab_files_names = VOCAB_FILES_NAMES
|
|
81
83
|
model_input_names = ["input_ids", "attention_mask"]
|
|
82
84
|
prefix_tokens: list[int] = []
|
|
85
|
+
model = Unigram
|
|
83
86
|
|
|
84
87
|
def __init__(
|
|
85
88
|
self,
|
|
86
|
-
vocab=None,
|
|
89
|
+
vocab: Optional[Union[str, dict, list]] = None,
|
|
87
90
|
unk_token="<unk>",
|
|
88
91
|
bos_token="<s>",
|
|
89
92
|
eos_token="</s>",
|
|
@@ -92,8 +95,6 @@ class BigBirdTokenizer(TokenizersBackend):
|
|
|
92
95
|
mask_token="[MASK]",
|
|
93
96
|
cls_token="[CLS]",
|
|
94
97
|
add_prefix_space=True,
|
|
95
|
-
vocab_file=None,
|
|
96
|
-
tokenizer_file=None,
|
|
97
98
|
**kwargs,
|
|
98
99
|
):
|
|
99
100
|
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
|
|
@@ -105,47 +106,18 @@ class BigBirdTokenizer(TokenizersBackend):
|
|
|
105
106
|
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
|
106
107
|
|
|
107
108
|
self.add_prefix_space = add_prefix_space
|
|
108
|
-
self.vocab_file = vocab_file
|
|
109
109
|
|
|
110
110
|
# Convert vocab to list of (token, score) tuples
|
|
111
111
|
if vocab is None:
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
vocab_scores = [(str(token), float(score)) for token, score in vocab]
|
|
118
|
-
else:
|
|
119
|
-
vocab_scores = [(str(token), 0.0) for token in vocab]
|
|
120
|
-
else:
|
|
121
|
-
vocab_scores = [(str(pad_token), 0.0), (str(eos_token), 0.0), (str(bos_token), 0.0)]
|
|
122
|
-
|
|
123
|
-
# Find unk_id in vocab
|
|
124
|
-
unk_token_content = str(unk_token)
|
|
125
|
-
unk_id = next((idx for idx, (token, _) in enumerate(vocab_scores) if token == unk_token_content), None)
|
|
126
|
-
if unk_id is None:
|
|
127
|
-
unk_id = min(len(vocab_scores), 100)
|
|
128
|
-
if len(vocab_scores) > 100:
|
|
129
|
-
vocab_scores.insert(100, (unk_token_content, 0.0))
|
|
130
|
-
else:
|
|
131
|
-
vocab_scores.append((unk_token_content, 0.0))
|
|
132
|
-
|
|
133
|
-
# Ensure cls_token and sep_token are in vocab
|
|
134
|
-
cls_token_str = str(cls_token)
|
|
135
|
-
sep_token_str = str(sep_token)
|
|
136
|
-
cls_token_id = next((idx for idx, (token, _) in enumerate(vocab_scores) if token == cls_token_str), None)
|
|
137
|
-
sep_token_id = next((idx for idx, (token, _) in enumerate(vocab_scores) if token == sep_token_str), None)
|
|
112
|
+
vocab = [(str(pad_token), 0.0), (str(eos_token), 0.0), (str(bos_token), 0.0), (str(unk_token), 0.0)]
|
|
113
|
+
unk_id = 3
|
|
114
|
+
elif isinstance(vocab, list):
|
|
115
|
+
# vocab.insert(100, (str(unk_token), 0.0)) # Ensure unk_token is in vocab at index 100
|
|
116
|
+
unk_id = vocab.index((str(unk_token), 0.0)) if (str(unk_token), 0.0) in vocab else 100
|
|
138
117
|
|
|
139
|
-
|
|
140
|
-
cls_token_id = len(vocab_scores)
|
|
141
|
-
vocab_scores.append((cls_token_str, 0.0))
|
|
142
|
-
if sep_token_id is None:
|
|
143
|
-
sep_token_id = len(vocab_scores)
|
|
144
|
-
vocab_scores.append((sep_token_str, 0.0))
|
|
145
|
-
|
|
146
|
-
self._tokenizer = Tokenizer(Unigram(vocab_scores, unk_id=unk_id, byte_fallback=False))
|
|
118
|
+
self._tokenizer = Tokenizer(Unigram(vocab, unk_id=unk_id, byte_fallback=False))
|
|
147
119
|
self._tokenizer.normalizer = normalizers.Sequence(
|
|
148
|
-
[normalizers.Strip(left=False, right=
|
|
120
|
+
[normalizers.Strip(left=False, right=False), normalizers.Replace(Regex(r" {2,}"), SPIECE_UNDERLINE)]
|
|
149
121
|
)
|
|
150
122
|
|
|
151
123
|
prepend_scheme = "always" if add_prefix_space else "never"
|
|
@@ -155,7 +127,6 @@ class BigBirdTokenizer(TokenizersBackend):
|
|
|
155
127
|
self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme, split=True)
|
|
156
128
|
|
|
157
129
|
super().__init__(
|
|
158
|
-
tokenizer_object=self._tokenizer,
|
|
159
130
|
bos_token=bos_token,
|
|
160
131
|
eos_token=eos_token,
|
|
161
132
|
unk_token=unk_token,
|
|
@@ -163,10 +134,15 @@ class BigBirdTokenizer(TokenizersBackend):
|
|
|
163
134
|
mask_token=mask_token,
|
|
164
135
|
cls_token=cls_token,
|
|
165
136
|
sep_token=sep_token,
|
|
137
|
+
add_prefix_space=add_prefix_space,
|
|
166
138
|
**kwargs,
|
|
167
139
|
)
|
|
168
140
|
|
|
169
|
-
|
|
141
|
+
# Ensure cls_token and sep_token are in vocab
|
|
142
|
+
cls_token_str = str(cls_token)
|
|
143
|
+
sep_token_str = str(sep_token)
|
|
144
|
+
cls_token_id = self.cls_token_id
|
|
145
|
+
sep_token_id = self.sep_token_id
|
|
170
146
|
|
|
171
147
|
self._tokenizer.post_processor = processors.TemplateProcessing(
|
|
172
148
|
single=f"{cls_token_str}:0 $A:0 {sep_token_str}:0",
|
|
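With this change the constructor hands the `(token, score)` list straight to the `tokenizers` backend instead of rebuilding a `vocab_scores` copy. A minimal sketch of that code path with a hand-built vocabulary (the token layout below is illustrative, not the shipped BigBird vocabulary):

```python
from tokenizers import Tokenizer
from tokenizers.models import Unigram

# Index 3 holds the unknown token, matching the `unk_id = 3` fallback above.
vocab = [("<pad>", 0.0), ("</s>", 0.0), ("<s>", 0.0), ("<unk>", 0.0), ("hello", -2.5)]
unk_id = vocab.index(("<unk>", 0.0)) if ("<unk>", 0.0) in vocab else 100

tokenizer = Tokenizer(Unigram(vocab, unk_id=unk_id, byte_fallback=False))
print(tokenizer.encode("hello").tokens)  # ['hello']
```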
@@ -23,6 +23,7 @@ import torch
|
|
|
23
23
|
from torch import nn
|
|
24
24
|
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
|
25
25
|
|
|
26
|
+
from ... import initialization as init
|
|
26
27
|
from ...activations import ACT2FN
|
|
27
28
|
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
|
|
28
29
|
from ...generation import GenerationMixin
|
|
@@ -1154,7 +1155,6 @@ class BigBirdPegasusEncoderAttention(nn.Module):
|
|
|
1154
1155
|
return outputs
|
|
1155
1156
|
|
|
1156
1157
|
|
|
1157
|
-
# Copied from transformers.models.bert.modeling_bert.eager_attention_forward
|
|
1158
1158
|
def eager_attention_forward(
|
|
1159
1159
|
module: nn.Module,
|
|
1160
1160
|
query: torch.Tensor,
|
|
@@ -1178,7 +1178,7 @@ def eager_attention_forward(
|
|
|
1178
1178
|
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
|
|
1179
1179
|
attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
|
|
1180
1180
|
|
|
1181
|
-
attn_output = torch.matmul(attn_weights, value)
|
|
1181
|
+
attn_output = torch.matmul(attn_weights.to(value.dtype), value)
|
|
1182
1182
|
attn_output = attn_output.transpose(1, 2).contiguous()
|
|
1183
1183
|
|
|
1184
1184
|
return attn_output, attn_weights
|
|
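The added cast guards against a dtype mismatch when the attention weights have been upcast (for example to `float32` for a numerically stable softmax) while `value` stays in half precision, since `torch.matmul` refuses mixed-dtype operands outside autocast. A minimal reproduction of the failure mode this fixes (shapes and dtypes are illustrative):

```python
import torch

attn_weights = torch.softmax(torch.randn(1, 2, 4, 4, dtype=torch.float16).float(), dim=-1)  # fp32
value = torch.randn(1, 2, 4, 8, dtype=torch.float16)  # fp16

# torch.matmul(attn_weights, value) would raise "expected scalar type Float but found Half"
attn_output = torch.matmul(attn_weights.to(value.dtype), value)
print(attn_output.dtype)  # torch.float16
```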
@@ -1537,6 +1537,11 @@ class BigBirdPegasusPreTrainedModel(PreTrainedModel):
     _skip_keys_device_placement = "past_key_values"
     _can_compile_fullgraph = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, BigBirdPegasusForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)
+
     @property
     def dummy_inputs(self):
         pad_token = self.config.pad_token_id
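The same `_init_weights` override recurs below for Blenderbot and BlenderbotSmall: defer to the parent class for the generic module init, then zero the model-specific `final_logits_bias` buffer. A stripped-down sketch of the pattern (the classes and the `zeros_` helper here are stand-ins, not the real `transformers` ones):

```python
import torch
from torch import nn

def zeros_(tensor: torch.Tensor) -> None:
    """Stand-in for transformers.initialization.zeros_ (assumed to zero-fill in place)."""
    with torch.no_grad():
        tensor.zero_()

class TinySeq2Seq(nn.Module):
    def __init__(self):
        super().__init__()
        self.lm_head = nn.Linear(16, 32, bias=False)
        self.register_buffer("final_logits_bias", torch.randn(1, 32))

class TinyPreTrained:
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):  # generic init handled by the base class
            nn.init.normal_(module.weight, std=0.02)

class TinyPreTrainedWithBias(TinyPreTrained):
    def _init_weights(self, module):
        super()._init_weights(module)        # keep the generic behaviour
        if isinstance(module, TinySeq2Seq):  # then reset the model-specific buffer
            zeros_(module.final_logits_bias)

model = TinySeq2Seq()
TinyPreTrainedWithBias()._init_weights(model)
assert model.final_logits_bias.abs().sum() == 0
```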
@@ -1595,6 +1600,7 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ):
         r"""
         Args:
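This `**kwargs` catch-all is appended to nearly every `forward` signature in the hunks that follow. It lets callers thread shared keyword arguments through wrapper models without a `TypeError` when an inner module does not consume them. A toy illustration of the failure it avoids (function names are illustrative):

```python
def forward_strict(input_ids, attention_mask=None):
    return input_ids

def forward_tolerant(input_ids, attention_mask=None, **kwargs):
    return input_ids  # unknown kwargs are accepted and ignored

shared = {"attention_mask": None, "output_attentions": False}
forward_tolerant([1, 2, 3], **shared)  # fine
# forward_strict([1, 2, 3], **shared)  # TypeError: unexpected keyword 'output_attentions'
```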
@@ -1868,6 +1874,7 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -2097,6 +2104,7 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -2235,6 +2243,7 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -2369,6 +2378,7 @@ class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqSequenceClassifierOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -2490,6 +2500,7 @@ class BigBirdPegasusForQuestionAnswering(BigBirdPegasusPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqQuestionAnsweringModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -2577,6 +2588,7 @@ class BigBirdPegasusDecoderWrapper(BigBirdPegasusPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.decoder = BigBirdPegasusDecoder(config)
+        self.post_init()
 
     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)
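Calling `post_init()` at the end of the wrapper's `__init__` follows the usual `PreTrainedModel` contract: weight init and final setup hooks run once all submodules exist, which was previously skipped when the wrapper was constructed directly. A toy version of the idiom (the base class is a stand-in, not the real `PreTrainedModel`):

```python
from torch import nn

class MiniPreTrained(nn.Module):
    """Stand-in base: post_init applies _init_weights to every submodule."""
    def post_init(self):
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.zeros_(module.bias)

class DecoderWrapper(MiniPreTrained):
    def __init__(self):
        super().__init__()
        self.decoder = nn.Linear(8, 8)
        self.post_init()  # run init now that all submodules are registered

w = DecoderWrapper()
assert w.decoder.bias.abs().sum() == 0
```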
@@ -2616,6 +2628,7 @@ class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

transformers/models/biogpt/modeling_biogpt.py

@@ -620,6 +620,7 @@ class BioGptForTokenClassification(BioGptPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -711,6 +712,7 @@ class BioGptForSequenceClassification(BioGptPreTrainedModel):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):

transformers/models/biogpt/modular_biogpt.py

@@ -442,6 +442,7 @@ class BioGptForTokenClassification(BioGptPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -533,6 +534,7 @@ class BioGptForSequenceClassification(BioGptPreTrainedModel):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):

transformers/models/bit/modeling_bit.py

@@ -84,7 +84,7 @@ class WeightStandardizedConv2d(nn.Conv2d):
     """Conv2d with Weight Standardization. Used for ViT Hybrid model.
 
     Paper: [Micro-Batch Training with Batch-Channel Normalization and Weight
-    Standardization](https://huggingface.co/papers/1903.
+    Standardization](https://huggingface.co/papers/1903.10520)
     """
 
     def __init__(
@@ -643,6 +643,10 @@ class BitPreTrainedModel(PreTrainedModel):
         elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
             init.constant_(module.weight, 1)
             init.constant_(module.bias, 0)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
 
 
 @auto_docstring
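`BatchNorm2d` keeps its statistics in buffers rather than parameters, so a weights-only init pass leaves `running_mean`/`running_var` untouched; the `getattr` guard also skips `GroupNorm`, which has no running statistics. A small sketch of resetting those buffers by hand:

```python
import torch
from torch import nn

bn = nn.BatchNorm2d(4)
bn(torch.randn(2, 4, 8, 8))  # one forward pass in train mode updates the statistics
assert bn.running_mean.abs().sum() > 0

with torch.no_grad():  # buffers are reset in place, mirroring the init helpers
    bn.running_mean.zero_()
    bn.running_var.fill_(1.0)
    bn.num_batches_tracked.zero_()

gn = nn.GroupNorm(2, 4)
assert getattr(gn, "running_mean", None) is None  # the guard skips GroupNorm
```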
@@ -666,7 +670,11 @@ class BitModel(BitPreTrainedModel):
 
     @auto_docstring
     def forward(
-        self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
+        self,
+        pixel_values: Tensor,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> BaseModelOutputWithPoolingAndNoAttention:
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -721,6 +729,7 @@ class BitForImageClassification(BitPreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> ImageClassifierOutputWithNoAttention:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -767,7 +776,11 @@ class BitBackbone(BitPreTrainedModel, BackboneMixin):
 
     @auto_docstring
     def forward(
-        self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
+        self,
+        pixel_values: Tensor,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> BackboneOutput:
         r"""
         Examples:

transformers/models/bitnet/modeling_bitnet.py

@@ -27,7 +27,7 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
@@ -36,7 +36,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_bitnet import BitNetConfig
 
 
@@ -151,6 +151,7 @@ def eager_attention_forward(
     return attn_output, attn_weights
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class BitNetAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -176,7 +177,6 @@ class BitNetAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         self.attn_sub_norm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
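The `use_kernelized_func` class decorator replaces the per-instance `self.rotary_fn = apply_rotary_pos_emb` assignment: the default rotary function is attached at class-definition time, where an optimized hub kernel can later be swapped in for all instances at once. A loose sketch of that decorator shape (this is a guess at the mechanism, not the actual `transformers` implementation, and the rotary math is a placeholder):

```python
def use_kernelized_func(default_fn):
    """Attach `default_fn` to the class so a faster kernel can replace it later."""
    def decorator(cls):
        cls.rotary_fn = staticmethod(default_fn)
        return cls
    return decorator

def apply_rotary_pos_emb_ref(q, k, cos, sin):
    return q * cos + k * sin  # placeholder math, not the real rotary formula

@use_kernelized_func(apply_rotary_pos_emb_ref)
class Attention:
    pass

# Later, a hub kernel can be swapped in for every instance at once:
Attention.rotary_fn = staticmethod(lambda q, k, cos, sin: (q, k))
```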
@@ -287,7 +287,7 @@ class BitNetRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
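Registering `original_inv_freq` as a non-persistent buffer instead of a plain attribute means it follows the module through `.to(...)` device/dtype moves while staying out of the `state_dict`. A quick demonstration of the difference:

```python
import torch
from torch import nn

class Rope(nn.Module):
    def __init__(self):
        super().__init__()
        inv_freq = torch.arange(4, dtype=torch.float32)
        self.plain_attr = inv_freq  # ignored by .to() and state_dict()
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

m = Rope().to(torch.float64)
print(m.plain_attr.dtype)                     # torch.float32 -- left behind
print(m.original_inv_freq.dtype)              # torch.float64 -- moved with the module
print("original_inv_freq" in m.state_dict())  # False, because persistent=False
```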
@@ -326,7 +326,7 @@ class BitNetRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
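Disabling autocast around the frequency computation keeps the cos/sin tables in `float32` even when the surrounding forward runs under mixed precision; `maybe_autocast` is presumably a thin wrapper over `torch.autocast` that tolerates device types without autocast support. The underlying behaviour, shown with plain `torch.autocast`:

```python
import torch

x = torch.randn(4, 4)
with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    inside = (x @ x).dtype  # autocast kicks in: bfloat16
    with torch.autocast(device_type="cpu", enabled=False):
        forced = (x.float() @ x.float()).dtype  # float32 preserved for rope tables
print(inside, forced)  # torch.bfloat16 torch.float32
```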
transformers/models/blenderbot/modeling_blenderbot.py

@@ -24,6 +24,7 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -437,6 +438,11 @@ class BlenderbotPreTrainedModel(PreTrainedModel):
     _supports_flex_attn = True
     _can_compile_fullgraph = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, BlenderbotForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)
+
     @property
     def dummy_inputs(self):
         pad_token = self.config.pad_token_id
@@ -493,6 +499,7 @@ class BlenderbotEncoder(BlenderbotPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -643,6 +650,7 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel):
         output_hidden_states=None,
         return_dict=None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -885,6 +893,7 @@ class BlenderbotModel(BlenderbotPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1039,6 +1048,7 @@ class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1152,6 +1162,7 @@ class BlenderbotDecoderWrapper(BlenderbotPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.decoder = BlenderbotDecoder(config)
+        self.post_init()
 
     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)
@@ -1196,6 +1207,7 @@ class BlenderbotForCausalLM(BlenderbotPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
transformers/models/blenderbot/tokenization_blenderbot.py

@@ -102,14 +102,15 @@ class BlenderbotTokenizer(TokenizersBackend):
         add_prefix_space (`bool`, *optional*, defaults to `True`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
             other word. (Blenderbot tokenizer detect beginning of words by the preceding space).
-        vocab (`dict`, *optional*):
-            Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file
-        merges (`list`, *optional*):
-            Custom merges list. If not provided, merges are loaded from merges_file
+        vocab (`str` or `dict[str, int]`, *optional*):
+            Custom vocabulary dictionary. If not provided, vocabulary is loaded from `vocab_file`.
+        merges (`str` or `list[str]`, *optional*):
+            Custom merges list. If not provided, merges are loaded from `merges_file`.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    model = BPE
 
     def __init__(
         self,
@@ -132,22 +133,20 @@ class BlenderbotTokenizer(TokenizersBackend):
         else mask_token
         )
 
-
-
-
-
-
-        else:
-            # Initialize with minimal vocab
-            self._vocab = {
+        # Initialize vocab and merges; when not provided fall back to minimal vocab
+        self._vocab = (
+            vocab
+            if vocab is not None
+            else {
                 str(bos_token): 0,
                 str(pad_token): 1,
                 str(eos_token): 2,
                 str(unk_token): 3,
                 str(mask_token): 4,
             }
-
+        )
 
+        self._merges = merges or []
         self._tokenizer = Tokenizer(
             BPE(
                 vocab=self._vocab,
@@ -161,17 +160,7 @@ class BlenderbotTokenizer(TokenizersBackend):
 
         self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
         self._tokenizer.decoder = decoders.ByteLevel()
-        self._tokenizer.post_processor = processors.RobertaProcessing(
-            sep=(str(eos_token), self._vocab.get(str(eos_token), 2)),
-            cls=(str(bos_token), self._vocab.get(str(bos_token), 0)),
-            add_prefix_space=add_prefix_space,
-            trim_offsets=True,
-        )
-
-        tokenizer_object = self._tokenizer
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             bos_token=bos_token,
             eos_token=eos_token,
             sep_token=sep_token,
@@ -182,6 +171,12 @@ class BlenderbotTokenizer(TokenizersBackend):
             add_prefix_space=add_prefix_space,
             **kwargs,
         )
+        self._tokenizer.post_processor = processors.RobertaProcessing(
+            sep=(str(eos_token), self.eos_token_id),
+            cls=(str(bos_token), self.bos_token_id),
+            add_prefix_space=add_prefix_space,
+            trim_offsets=True,
+        )
 
 
 __all__ = ["BlenderbotTokenizer"]
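Building the `RobertaProcessing` post-processor after `super().__init__(...)` lets it read the final `bos`/`eos` ids from the tokenizer itself rather than guessing `0`/`2` out of a possibly user-supplied vocab. For reference, this post-processor wraps single sequences in `<s> ... </s>`; a self-contained sketch with an illustrative toy vocabulary:

```python
from tokenizers import Tokenizer, processors
from tokenizers.models import BPE

vocab = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3, "h": 4, "i": 5, "hi": 6}
tok = Tokenizer(BPE(vocab=vocab, merges=[("h", "i")], unk_token="<unk>"))
tok.post_processor = processors.RobertaProcessing(
    sep=("</s>", vocab["</s>"]), cls=("<s>", vocab["<s>"]), add_prefix_space=True, trim_offsets=True
)
print(tok.encode("hi").tokens)  # ['<s>', 'hi', '</s>']
```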
transformers/models/blenderbot_small/modeling_blenderbot_small.py

@@ -22,6 +22,7 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -430,6 +431,11 @@ class BlenderbotSmallPreTrainedModel(PreTrainedModel):
     _supports_flex_attn = True
     _can_compile_fullgraph = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, BlenderbotSmallForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)
+
     @property
     def dummy_inputs(self):
         pad_token = self.config.pad_token_id
@@ -484,6 +490,7 @@ class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -630,6 +637,7 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
         output_hidden_states=None,
         return_dict=None,
         cache_position=None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -858,6 +866,7 @@ class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -999,6 +1008,7 @@ class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1112,6 +1122,7 @@ class BlenderbotSmallDecoderWrapper(BlenderbotSmallPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.decoder = BlenderbotSmallDecoder(config)
+        self.post_init()
 
     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)
@@ -1156,6 +1167,7 @@ class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
transformers/models/blip/modeling_blip.py

@@ -430,6 +430,8 @@ class BlipPreTrainedModel(PreTrainedModel):
             std = self.config.vision_config.initializer_range
             init.trunc_normal_(module.position_embedding, mean=0.0, std=std)
             init.trunc_normal_(module.class_embedding, mean=0.0, std=std)
+        elif isinstance(module, BlipTextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
 
 
 class BlipEncoder(nn.Module):
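The added branch re-creates the default `position_ids` buffer, a `(1, max_position_embeddings)` row of consecutive indices, whenever weights are (re)initialized. What that expression produces:

```python
import torch

max_positions = 8
position_ids = torch.arange(max_positions).expand((1, -1))
print(position_ids)        # tensor([[0, 1, 2, 3, 4, 5, 6, 7]])
print(position_ids.shape)  # torch.Size([1, 8])
```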