transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc1-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only. A standard-library sketch of how such a wheel-to-wheel summary can be reproduced locally follows the file list below.
- transformers/__init__.py +30 -3
- transformers/cli/serve.py +47 -17
- transformers/conversion_mapping.py +15 -2
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +196 -135
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +1 -2
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +1 -2
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/configuration_utils.py +3 -2
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/continuous_api.py +134 -79
- transformers/image_processing_base.py +1 -2
- transformers/integrations/__init__.py +4 -2
- transformers/integrations/accelerate.py +15 -3
- transformers/integrations/aqlm.py +38 -66
- transformers/integrations/awq.py +48 -514
- transformers/integrations/bitnet.py +45 -100
- transformers/integrations/bitsandbytes.py +79 -191
- transformers/integrations/deepspeed.py +1 -0
- transformers/integrations/eetq.py +84 -79
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +236 -193
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +40 -62
- transformers/integrations/hub_kernels.py +42 -3
- transformers/integrations/integration_utils.py +10 -0
- transformers/integrations/mxfp4.py +25 -65
- transformers/integrations/peft.py +7 -29
- transformers/integrations/quanto.py +73 -55
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +44 -90
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +42 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +8 -0
- transformers/modeling_rope_utils.py +30 -6
- transformers/modeling_utils.py +116 -112
- transformers/models/__init__.py +3 -0
- transformers/models/afmoe/modeling_afmoe.py +4 -4
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +2 -0
- transformers/models/altclip/modeling_altclip.py +4 -0
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/modeling_aria.py +4 -4
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/auto/configuration_auto.py +11 -0
- transformers/models/auto/feature_extraction_auto.py +2 -0
- transformers/models/auto/image_processing_auto.py +1 -0
- transformers/models/auto/modeling_auto.py +6 -0
- transformers/models/auto/processing_auto.py +18 -10
- transformers/models/auto/tokenization_auto.py +74 -472
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/bamba/modeling_bamba.py +4 -3
- transformers/models/bark/modeling_bark.py +2 -0
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/big_bird/modeling_big_bird.py +6 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +11 -2
- transformers/models/bitnet/modeling_bitnet.py +4 -4
- transformers/models/blenderbot/modeling_blenderbot.py +5 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
- transformers/models/blip/modeling_blip_text.py +2 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -1
- transformers/models/bloom/modeling_bloom.py +4 -0
- transformers/models/blt/modeling_blt.py +2 -2
- transformers/models/blt/modular_blt.py +2 -2
- transformers/models/bridgetower/modeling_bridgetower.py +5 -1
- transformers/models/bros/modeling_bros.py +4 -0
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +5 -0
- transformers/models/chameleon/modeling_chameleon.py +2 -1
- transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
- transformers/models/clap/modeling_clap.py +5 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +5 -0
- transformers/models/clvp/modeling_clvp.py +5 -0
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +4 -3
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +7 -6
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
- transformers/models/convbert/modeling_convbert.py +6 -0
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/modeling_csm.py +4 -3
- transformers/models/ctrl/modeling_ctrl.py +1 -0
- transformers/models/cvt/modeling_cvt.py +2 -0
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/modeling_d_fine.py +2 -0
- transformers/models/d_fine/modular_d_fine.py +1 -0
- transformers/models/dab_detr/modeling_dab_detr.py +4 -0
- transformers/models/dac/modeling_dac.py +2 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/dbrx/modeling_dbrx.py +2 -2
- transformers/models/deberta/modeling_deberta.py +5 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
- transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
- transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
- transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/modeling_detr.py +5 -0
- transformers/models/dia/modeling_dia.py +4 -3
- transformers/models/dia/modular_dia.py +0 -1
- transformers/models/diffllama/modeling_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +2 -3
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +2 -0
- transformers/models/dots1/modeling_dots1.py +10 -7
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/edgetam/modeling_edgetam.py +1 -1
- transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
- transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
- transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
- transformers/models/efficientnet/modeling_efficientnet.py +2 -0
- transformers/models/emu3/modeling_emu3.py +4 -4
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +14 -2
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
- transformers/models/esm/modeling_esmfold.py +5 -4
- transformers/models/evolla/modeling_evolla.py +4 -4
- transformers/models/exaone4/modeling_exaone4.py +2 -2
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +6 -1
- transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
- transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
- transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
- transformers/models/flaubert/modeling_flaubert.py +7 -0
- transformers/models/flava/modeling_flava.py +6 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
- transformers/models/florence2/modeling_florence2.py +2 -1
- transformers/models/florence2/modular_florence2.py +2 -1
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/processing_fuyu.py +3 -3
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +2 -1
- transformers/models/gemma3/modeling_gemma3.py +14 -84
- transformers/models/gemma3/modular_gemma3.py +12 -81
- transformers/models/gemma3n/modeling_gemma3n.py +18 -209
- transformers/models/gemma3n/modular_gemma3n.py +17 -59
- transformers/models/git/modeling_git.py +2 -0
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/modeling_glm4v.py +3 -3
- transformers/models/glm4v/modular_glm4v.py +6 -4
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
- transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/gpt2/modeling_gpt2.py +5 -1
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
- transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
- transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
- transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
- transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
- transformers/models/gptj/modeling_gptj.py +3 -0
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granitemoe/modeling_granitemoe.py +4 -6
- transformers/models/granitemoe/modular_granitemoe.py +0 -2
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
- transformers/models/groupvit/modeling_groupvit.py +3 -0
- transformers/models/helium/modeling_helium.py +4 -3
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +3 -0
- transformers/models/hubert/modular_hubert.py +1 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
- transformers/models/ibert/modeling_ibert.py +6 -0
- transformers/models/idefics/modeling_idefics.py +5 -21
- transformers/models/imagegpt/modeling_imagegpt.py +2 -1
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/internvl/modeling_internvl.py +2 -4
- transformers/models/internvl/modular_internvl.py +2 -4
- transformers/models/jamba/modeling_jamba.py +2 -2
- transformers/models/janus/modeling_janus.py +1 -0
- transformers/models/janus/modular_janus.py +1 -0
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/kosmos2/modeling_kosmos2.py +1 -0
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +244 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +729 -0
- transformers/models/lasr/modular_lasr.py +569 -0
- transformers/models/lasr/processing_lasr.py +96 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +5 -0
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +3 -0
- transformers/models/lfm2/modeling_lfm2.py +4 -5
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +4 -0
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/modeling_llama4.py +3 -2
- transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
- transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -0
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +4 -0
- transformers/models/mamba/modeling_mamba.py +14 -22
- transformers/models/marian/modeling_marian.py +5 -0
- transformers/models/markuplm/modeling_markuplm.py +4 -0
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/modeling_mask2former.py +2 -0
- transformers/models/maskformer/modeling_maskformer.py +2 -0
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +3 -1
- transformers/models/minimax/modeling_minimax.py +4 -4
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +4 -3
- transformers/models/mistral/modeling_mistral.py +4 -3
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mllama/modeling_mllama.py +2 -2
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/modeling_mobilevit.py +3 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
- transformers/models/modernbert/modeling_modernbert.py +4 -1
- transformers/models/modernbert/modular_modernbert.py +2 -0
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
- transformers/models/moonshine/modeling_moonshine.py +4 -2
- transformers/models/moshi/modeling_moshi.py +5 -2
- transformers/models/mpnet/modeling_mpnet.py +5 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +6 -0
- transformers/models/mt5/modeling_mt5.py +7 -0
- transformers/models/musicgen/modeling_musicgen.py +2 -0
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nemotron/modeling_nemotron.py +4 -2
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nougat/tokenization_nougat.py +11 -59
- transformers/models/nystromformer/modeling_nystromformer.py +6 -0
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +4 -5
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
- transformers/models/oneformer/modeling_oneformer.py +4 -1
- transformers/models/openai/modeling_openai.py +3 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/owlv2/modeling_owlv2.py +4 -0
- transformers/models/owlvit/modeling_owlvit.py +4 -0
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +9 -6
- transformers/models/parakeet/modular_parakeet.py +2 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
- transformers/models/patchtst/modeling_patchtst.py +20 -2
- transformers/models/pegasus/modeling_pegasus.py +5 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
- transformers/models/perceiver/modeling_perceiver.py +8 -0
- transformers/models/persimmon/modeling_persimmon.py +2 -1
- transformers/models/phi/modeling_phi.py +4 -5
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +2 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
- transformers/models/phimoe/modeling_phimoe.py +4 -4
- transformers/models/phimoe/modular_phimoe.py +2 -2
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pixtral/modeling_pixtral.py +2 -1
- transformers/models/plbart/modeling_plbart.py +6 -0
- transformers/models/plbart/modular_plbart.py +2 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/modeling_poolformer.py +2 -0
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +3 -0
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
- transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
- transformers/models/rag/modeling_rag.py +1 -0
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
- transformers/models/reformer/modeling_reformer.py +4 -0
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +6 -1
- transformers/models/rembert/modeling_rembert.py +6 -0
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +11 -2
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/modeling_rt_detr.py +2 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
- transformers/models/rwkv/modeling_rwkv.py +1 -0
- transformers/models/sam2/modeling_sam2.py +2 -2
- transformers/models/sam2/modular_sam2.py +2 -2
- transformers/models/sam2_video/modeling_sam2_video.py +1 -0
- transformers/models/sam2_video/modular_sam2_video.py +1 -0
- transformers/models/sam3/modeling_sam3.py +77 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
- transformers/models/sam3_video/modeling_sam3_video.py +1 -0
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
- transformers/models/seed_oss/modeling_seed_oss.py +2 -2
- transformers/models/segformer/modeling_segformer.py +4 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/siglip2/modeling_siglip2.py +4 -0
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
- transformers/models/speecht5/modeling_speecht5.py +13 -1
- transformers/models/splinter/modeling_splinter.py +3 -0
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +6 -0
- transformers/models/stablelm/modeling_stablelm.py +3 -1
- transformers/models/starcoder2/modeling_starcoder2.py +4 -3
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +2 -0
- transformers/models/swin/modeling_swin.py +4 -0
- transformers/models/swin2sr/modeling_swin2sr.py +2 -0
- transformers/models/swinv2/modeling_swinv2.py +4 -0
- transformers/models/t5/modeling_t5.py +7 -0
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +5 -5
- transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
- transformers/models/table_transformer/modeling_table_transformer.py +4 -0
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +2 -0
- transformers/models/timesfm/modular_timesfm.py +2 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
- transformers/models/trocr/modeling_trocr.py +2 -0
- transformers/models/tvp/modeling_tvp.py +2 -0
- transformers/models/udop/modeling_udop.py +4 -0
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/modeling_umt5.py +7 -0
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/vilt/modeling_vilt.py +6 -0
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +6 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/modeling_vitmatte.py +1 -0
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/modeling_whisper.py +6 -0
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +3 -0
- transformers/models/xglm/modeling_xglm.py +1 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +5 -0
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/yoso/modeling_yoso.py +6 -0
- transformers/models/zamba/modeling_zamba.py +2 -0
- transformers/models/zamba2/modeling_zamba2.py +4 -2
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/modeling_zoedepth.py +1 -0
- transformers/pipelines/__init__.py +2 -3
- transformers/pipelines/base.py +1 -9
- transformers/pipelines/document_question_answering.py +3 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/processing_utils.py +23 -11
- transformers/quantizers/base.py +35 -110
- transformers/quantizers/quantizer_aqlm.py +1 -5
- transformers/quantizers/quantizer_auto_round.py +1 -2
- transformers/quantizers/quantizer_awq.py +17 -81
- transformers/quantizers/quantizer_bitnet.py +3 -8
- transformers/quantizers/quantizer_bnb_4bit.py +13 -110
- transformers/quantizers/quantizer_bnb_8bit.py +16 -92
- transformers/quantizers/quantizer_compressed_tensors.py +1 -5
- transformers/quantizers/quantizer_eetq.py +14 -62
- transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
- transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
- transformers/quantizers/quantizer_fp_quant.py +48 -78
- transformers/quantizers/quantizer_gptq.py +7 -24
- transformers/quantizers/quantizer_higgs.py +40 -54
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +13 -167
- transformers/quantizers/quantizer_quanto.py +20 -64
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +1 -4
- transformers/quantizers/quantizer_torchao.py +23 -202
- transformers/quantizers/quantizer_vptq.py +8 -22
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +297 -36
- transformers/tokenization_mistral_common.py +4 -0
- transformers/tokenization_utils_base.py +113 -222
- transformers/tokenization_utils_tokenizers.py +168 -107
- transformers/trainer.py +28 -31
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +66 -28
- transformers/utils/__init__.py +3 -4
- transformers/utils/auto_docstring.py +1 -0
- transformers/utils/generic.py +27 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +61 -16
- transformers/utils/kernel_config.py +4 -2
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +75 -242
- transformers/video_processing_utils.py +1 -2
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
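The per-file counts above come from the registry's diff tooling. A rough equivalent can be rebuilt with nothing but the Python standard library; the sketch below assumes both wheels sit in the current working directory and only covers files present in both archives (renames, additions, and deletions need extra handling):

import difflib
import zipfile

old = zipfile.ZipFile("transformers-5.0.0rc0-py3-none-any.whl")
new = zipfile.ZipFile("transformers-5.0.0rc1-py3-none-any.whl")

# Only members present in both wheels are compared here.
for name in sorted(set(old.namelist()) & set(new.namelist())):
    if not name.endswith(".py"):
        continue
    a = old.read(name).decode("utf-8").splitlines(keepends=True)
    b = new.read(name).decode("utf-8").splitlines(keepends=True)
    diff = list(difflib.unified_diff(a, b, fromfile=name, tofile=name))
    added = sum(1 for line in diff if line.startswith("+") and not line.startswith("+++"))
    removed = sum(1 for line in diff if line.startswith("-") and not line.startswith("---"))
    if added or removed:
        print(f"- {name} +{added} -{removed}")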
transformers/models/sew/modeling_sew.py

@@ -773,6 +773,7 @@ class SEWModel(SEWPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -902,6 +903,7 @@ class SEWForCTC(SEWPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
@@ -1013,6 +1015,7 @@ class SEWForSequenceClassification(SEWPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
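The one-line change repeated across the hunks above (and across most modeling files in this release) is a trailing **kwargs added to forward signatures. A minimal sketch of the motivation; class and argument names here are illustrative, not transformers code:

from typing import Optional


class StrictEncoder:
    def forward(self, input_values, output_attentions: Optional[bool] = None):
        return input_values


class TolerantEncoder:
    def forward(self, input_values, output_attentions: Optional[bool] = None, **kwargs):
        # Extra keyword arguments introduced by newer shared call sites
        # (cache positions, attention kwargs, ...) are accepted and ignored
        # instead of raising TypeError.
        return input_values


shared_kwargs = {"output_attentions": False, "cache_position": None}
TolerantEncoder().forward([1.0, 2.0], **shared_kwargs)   # works
# StrictEncoder().forward([1.0, 2.0], **shared_kwargs)   # TypeError: unexpected keyword argument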
transformers/models/sew/modular_sew.py

@@ -392,6 +392,7 @@ class SEWModel(SEWPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
transformers/models/sew_d/modeling_sew_d.py

@@ -1318,6 +1318,7 @@ class SEWDModel(SEWDPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1445,6 +1446,7 @@ class SEWDForCTC(SEWDPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
@@ -1557,6 +1559,7 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
transformers/models/siglip2/modeling_siglip2.py

@@ -510,6 +510,7 @@ class Siglip2VisionTransformer(Siglip2PreTrainedModel):
         spatial_shapes: torch.LongTensor,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> BaseModelOutputWithPooling:
         r"""
         spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
@@ -760,6 +761,7 @@ class Siglip2VisionModel(Siglip2PreTrainedModel):
         spatial_shapes: torch.LongTensor,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> BaseModelOutputWithPooling:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
@@ -927,6 +929,7 @@ class Siglip2Model(Siglip2PreTrainedModel):
         return_loss: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> Siglip2Output:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
@@ -1058,6 +1061,7 @@ class Siglip2ForImageClassification(Siglip2PreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> ImageClassifierOutput:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
transformers/models/siglip2/modular_siglip2.py

@@ -247,6 +247,7 @@ class Siglip2VisionTransformer(SiglipVisionTransformer):
         spatial_shapes: torch.LongTensor,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> BaseModelOutputWithPooling:
         r"""
         spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
@@ -324,6 +325,7 @@ class Siglip2VisionModel(SiglipVisionModel):
         spatial_shapes: torch.LongTensor,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> BaseModelOutputWithPooling:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
@@ -419,6 +421,7 @@ class Siglip2Model(SiglipModel):
         return_loss: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> Siglip2Output:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
@@ -522,6 +525,7 @@ class Siglip2ForImageClassification(SiglipForImageClassification):
         labels: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> ImageClassifierOutput:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
transformers/models/smollm3/modeling_smollm3.py

@@ -28,7 +28,7 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import (
@@ -42,7 +42,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_smollm3 import SmolLM3Config


@@ -102,7 +102,7 @@ class SmolLM3RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -184,6 +184,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class SmolLM3Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -209,7 +210,6 @@ class SmolLM3Attention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb

         self.use_rope = config.no_rope_layers[layer_idx]
         self.sliding_window = (
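Two refactors run through this file: torch.autocast in the rotary embedding is routed through a new maybe_autocast helper imported from ...utils.generic, and the per-instance self.rotary_fn assignment gives way to the @use_kernelized_func(apply_rotary_pos_emb) class decorator. The diff does not show maybe_autocast itself; a hedged sketch of what such a wrapper plausibly does (the real implementation in transformers/utils/generic.py may differ):

import contextlib

import torch


def maybe_autocast(device_type: str = "cpu", enabled: bool = True):
    # Plausible behaviour: use torch.autocast where the backend supports it,
    # and fall back to a do-nothing context manager otherwise.
    try:
        return torch.autocast(device_type=device_type, enabled=enabled)
    except RuntimeError:
        return contextlib.nullcontext()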
transformers/models/smolvlm/processing_smolvlm.py

@@ -27,13 +27,6 @@ from ...utils import is_num2words_available, is_vision_available, logging
 from ...video_utils import VideoInput


-if is_vision_available():
-    from .video_processing_smolvlm import (
-        DEFAULT_MEDIA_OUTTRO,
-        DEFAULT_VIDEO_INTRO,
-        FRAME_TIMESTAMP_MESSAGE,
-    )
-
 if is_vision_available():
     from .video_processing_smolvlm import (
         DEFAULT_MEDIA_OUTTRO,
transformers/models/speech_to_text/modeling_speech_to_text.py

@@ -567,6 +567,7 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -707,6 +708,7 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel):
         output_hidden_states=None,
         return_dict=None,
         cache_position=None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -899,6 +901,7 @@ class Speech2TextModel(Speech2TextPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1035,6 +1038,7 @@ class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
transformers/models/speecht5/modeling_speecht5.py

@@ -1239,6 +1239,7 @@ class SpeechT5Encoder(SpeechT5PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         """
         Args:
@@ -1342,6 +1343,7 @@ class SpeechT5EncoderWithSpeechPrenet(SpeechT5PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         hidden_states, attention_mask = self.prenet(input_values, attention_mask)

@@ -1382,6 +1384,7 @@ class SpeechT5EncoderWithTextPrenet(SpeechT5PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         hidden_states = self.prenet(input_values)

@@ -1416,6 +1419,7 @@ class SpeechT5EncoderWithoutPrenet(SpeechT5PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         return self.wrapped_encoder(
             hidden_states=input_values,
@@ -1454,6 +1458,7 @@ class SpeechT5Decoder(SpeechT5PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         r"""
         Args:
@@ -1613,6 +1618,7 @@ class SpeechT5DecoderWithSpeechPrenet(SpeechT5PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         decoder_hidden_states = self.prenet(input_values, speaker_embeddings)

@@ -1663,6 +1669,7 @@ class SpeechT5DecoderWithTextPrenet(SpeechT5PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         decoder_hidden_states, attention_mask = self.prenet(input_values, attention_mask, past_key_values)

@@ -1707,6 +1714,7 @@ class SpeechT5DecoderWithoutPrenet(SpeechT5PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         outputs = self.wrapped_decoder(
             hidden_states=input_values,
@@ -1905,6 +1913,7 @@ class SpeechT5Model(SpeechT5PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
         r"""
         input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
@@ -2046,6 +2055,7 @@ class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         labels: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqLMOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -2356,6 +2366,7 @@ class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel):
         labels: Optional[torch.FloatTensor] = None,
         stop_labels: Optional[torch.Tensor] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqSpectrogramOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -2694,6 +2705,7 @@ class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
         labels: Optional[torch.FloatTensor] = None,
         stop_labels: Optional[torch.Tensor] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqSpectrogramOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -3023,7 +3035,7 @@ class SpeechT5HifiGan(PreTrainedModel):
         waveform.
         """
     )
-    def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, spectrogram: torch.FloatTensor, **kwargs) -> torch.FloatTensor:
         r"""
         spectrogram (`torch.FloatTensor`):
             Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
transformers/models/splinter/modeling_splinter.py

@@ -368,6 +368,7 @@ class SplinterModel(SplinterPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
@@ -516,6 +517,7 @@ class SplinterForQuestionAnswering(SplinterPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         question_positions: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         r"""
         token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
@@ -658,6 +660,7 @@ class SplinterForPreTraining(SplinterPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         question_positions: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, SplinterForPreTrainingOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_questions, sequence_length)`):
transformers/models/splinter/tokenization_splinter.py

@@ -16,7 +16,7 @@
 """Tokenization classes for Splinter."""

 import collections
-from typing import Optional
+from typing import Optional, Union

 from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
 from tokenizers.models import WordPiece
@@ -72,16 +72,17 @@ class SplinterTokenizer(TokenizersBackend):
         strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
             value for `lowercase`.
-        vocab (`dict`, *optional*):
+        vocab (`str`, `dict` or `list`, *optional*):
             Custom vocabulary dictionary. If not provided, a minimal vocabulary is created.
     """

     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
-
+    model = WordPiece

     def __init__(
         self,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
         do_lower_case: bool = True,
         unk_token: str = "[UNK]",
         sep_token: str = "[SEP]",
@@ -91,15 +92,12 @@ class SplinterTokenizer(TokenizersBackend):
         question_token: str = "[QUESTION]",
         tokenize_chinese_chars: bool = True,
         strip_accents: Optional[bool] = None,
-        vocab: Optional[dict] = None,
         **kwargs,
     ):
-
-
-
-
-        else:
-            self._vocab = {
+        self._vocab = (
+            vocab
+            if vocab is not None
+            else {
                 str(pad_token): 0,
                 str(unk_token): 1,
                 str(cls_token): 2,
@@ -108,6 +106,7 @@ class SplinterTokenizer(TokenizersBackend):
                 str(question_token): 5,
                 ".": 6,
             }
+        )

         self._tokenizer = Tokenizer(WordPiece(self._vocab, unk_token=str(unk_token)))

@@ -120,10 +119,7 @@ class SplinterTokenizer(TokenizersBackend):
         self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
         self._tokenizer.decoder = decoders.WordPiece(prefix="##")

-        tokenizer_object = self._tokenizer
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             unk_token=unk_token,
             sep_token=sep_token,
             pad_token=pad_token,
@@ -136,21 +132,6 @@ class SplinterTokenizer(TokenizersBackend):
             **kwargs,
         )

-        if hasattr(self, "_tokenizer") and self._tokenizer.normalizer is not None:
-            import json
-
-            pre_tok_state = json.loads(self._tokenizer.normalizer.__getstate__())
-            if (
-                pre_tok_state.get("lowercase", do_lower_case) != do_lower_case
-                or pre_tok_state.get("strip_accents", strip_accents) != strip_accents
-                or pre_tok_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
-            ):
-                pre_tok_class = getattr(normalizers, pre_tok_state.pop("type"))
-                pre_tok_state["lowercase"] = do_lower_case
-                pre_tok_state["strip_accents"] = strip_accents
-                pre_tok_state["handle_chinese_chars"] = tokenize_chinese_chars
-                self._tokenizer.normalizer = pre_tok_class(**pre_tok_state)
-
         self.do_lower_case = do_lower_case
         self.tokenize_chinese_chars = tokenize_chinese_chars
         self.strip_accents = strip_accents
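The net effect of the hunks above: vocab becomes the first __init__ parameter of SplinterTokenizer, the default-vocabulary fallback collapses into a single conditional expression, and the tokenizer_object plumbing plus the normalizer-patching block are dropped. A usage sketch under the new signature; the toy vocabulary below is illustrative:

from transformers.models.splinter.tokenization_splinter import SplinterTokenizer

toy_vocab = {
    "[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3,
    "[MASK]": 4, "[QUESTION]": 5, ".": 6, "hello": 7,
}
tok = SplinterTokenizer(vocab=toy_vocab)  # custom WordPiece vocabulary
tok_minimal = SplinterTokenizer()         # falls back to the built-in minimal vocab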
transformers/models/squeezebert/modeling_squeezebert.py

@@ -443,6 +443,7 @@ class SqueezeBertModel(SqueezeBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -528,6 +529,7 @@ class SqueezeBertForMaskedLM(SqueezeBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MaskedLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -599,6 +601,7 @@ class SqueezeBertForSequenceClassification(SqueezeBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -683,6 +686,7 @@ class SqueezeBertForMultipleChoice(SqueezeBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MultipleChoiceModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -786,6 +790,7 @@ class SqueezeBertForTokenClassification(SqueezeBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -851,6 +856,7 @@ class SqueezeBertForQuestionAnswering(SqueezeBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -45,6 +45,7 @@ from ...modeling_rope_utils import (
|
|
|
45
45
|
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
|
46
46
|
from ...processing_utils import Unpack
|
|
47
47
|
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
|
|
48
|
+
from ...utils.generic import maybe_autocast
|
|
48
49
|
from .configuration_stablelm import StableLmConfig
|
|
49
50
|
|
|
50
51
|
|
|
@@ -117,7 +118,7 @@ class StableLmRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
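For context, `maybe_autocast` (imported from `...utils.generic` in the hunk above) replaces the bare `torch.autocast` context manager around the rotary computation. A plausible minimal sketch of such a helper, assuming its purpose is to fall back to a no-op context when autocast is not usable for the device; the actual implementation in `transformers.utils.generic` may differ:

    import contextlib

    import torch

    def maybe_autocast(device_type: str = "cpu", enabled: bool = True, **kwargs):
        # Wrap the block in torch.autocast when the device type supports it;
        # otherwise return a null context so the block runs unchanged.
        try:
            return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
        except RuntimeError:
            return contextlib.nullcontext()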
@@ -492,6 +493,7 @@ class StableLmModel(StableLmPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> BaseModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
transformers/models/starcoder2/modeling_starcoder2.py

@@ -35,7 +35,7 @@ from transformers.utils.generic import check_model_inputs
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_func_from_hub
+from ...integrations import use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import (
@@ -48,6 +48,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.generic import maybe_autocast
 from .configuration_starcoder2 import Starcoder2Config
 
 
@@ -141,6 +142,7 @@ def eager_attention_forward(
     return attn_output, attn_weights
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class Starcoder2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -157,7 +159,6 @@ class Starcoder2Attention(nn.Module):
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
         self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
         self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.use_bias)
-        self.rotary_fn = apply_rotary_pos_emb
         self.residual_dropout = config.residual_dropout
 
     def forward(
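The two hunks above replace the per-instance `self.rotary_fn = apply_rotary_pos_emb` attribute with a class-level `@use_kernelized_func(apply_rotary_pos_emb)` decorator. A hedged sketch of what such a decorator could do, assuming it registers the reference function on the class so a kernel registry can later substitute a fused implementation (the real helper in `...integrations` is likely more involved):

    def use_kernelized_func(reference_fn):
        # Hypothetical sketch: keep the eager reference implementation as a
        # class attribute; a hub-kernel registry could later rebind
        # cls.rotary_fn to an optimized kernel with the same signature.
        def decorator(cls):
            cls.rotary_fn = staticmethod(reference_fn)
            return cls
        return decorator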
@@ -327,7 +328,7 @@ class Starcoder2RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
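Both rotary hunks keep the `# Force float32` comment: the angle table must be computed with autocast disabled because half-precision position-frequency products drift badly at large positions. A small self-contained illustration (not taken from the diff):

    import torch

    inv_freq = 1.0 / (10000 ** (torch.arange(0, 64, 2).float() / 64))
    positions = torch.arange(4096).float()
    freqs_fp32 = torch.outer(positions, inv_freq)
    freqs_fp16 = torch.outer(positions.half(), inv_freq.half()).float()
    # The half-precision angles diverge from the float32 reference,
    # which corrupts the cos/sin tables at distant positions:
    print((freqs_fp32.cos() - freqs_fp16.cos()).abs().max())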
transformers/models/superglue/modeling_superglue.py

@@ -670,6 +670,7 @@ class SuperGlueForKeypointMatching(SuperGluePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SuperGlueKeypointMatchingOutput]:
         r"""
         Examples:

transformers/models/superpoint/modeling_superpoint.py

@@ -378,6 +378,7 @@ class SuperPointForKeypointDetection(SuperPointPreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SuperPointKeypointDescriptionOutput]:
         r"""
         Examples:
transformers/models/swiftformer/modeling_swiftformer.py

@@ -428,6 +428,7 @@ class SwiftFormerModel(SwiftFormerPreTrainedModel):
         pixel_values: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithNoAttention]:
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -478,6 +479,7 @@ class SwiftFormerForImageClassification(SwiftFormerPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
transformers/models/swin/modeling_swin.py

@@ -860,6 +860,7 @@ class SwinModel(SwinPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SwinModelOutput]:
         r"""
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
@@ -946,6 +947,7 @@ class SwinForMaskedImageModeling(SwinPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SwinMaskedImageModelingOutput]:
         r"""
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
@@ -1059,6 +1061,7 @@ class SwinForImageClassification(SwinPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SwinImageClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1129,6 +1132,7 @@ class SwinBackbone(SwinPreTrainedModel, BackboneMixin):
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> BackboneOutput:
         """
         Returns:
transformers/models/swin2sr/modeling_swin2sr.py

@@ -754,6 +754,7 @@ class Swin2SRModel(Swin2SRPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -972,6 +973,7 @@ class Swin2SRForImageSuperResolution(Swin2SRPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ImageSuperResolutionOutput]:
         r"""
         Example:
transformers/models/swinv2/modeling_swinv2.py

@@ -942,6 +942,7 @@ class Swinv2Model(Swinv2PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Swinv2ModelOutput]:
         r"""
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
@@ -1030,6 +1031,7 @@ class Swinv2ForMaskedImageModeling(Swinv2PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Swinv2MaskedImageModelingOutput]:
         r"""
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
@@ -1144,6 +1146,7 @@ class Swinv2ForImageClassification(Swinv2PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Swinv2ImageClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1209,6 +1212,7 @@ class Swinv2Backbone(Swinv2PreTrainedModel, BackboneMixin):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> BackboneOutput:
         r"""
         Examples:
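Taken together, the SuperGlue, SuperPoint, SwiftFormer, Swin, Swin2SR, and Swinv2 hunks apply the same signature widening as the earlier text models: every public forward now ends in `**kwargs`. Illustrative usage, assuming a standard checkpoint (the extra keyword is hypothetical and shown only to demonstrate the widened signature):

    import torch
    from transformers import SwinModel

    model = SwinModel.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
    pixel_values = torch.randn(1, 3, 224, 224)
    # A keyword the signature does not name is now absorbed instead of raising:
    outputs = model(pixel_values=pixel_values, experimental_flag=True)
    print(outputs.last_hidden_state.shape)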