transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +30 -3
- transformers/cli/serve.py +47 -17
- transformers/conversion_mapping.py +15 -2
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +196 -135
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +1 -2
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +1 -2
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/configuration_utils.py +3 -2
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/continuous_api.py +134 -79
- transformers/image_processing_base.py +1 -2
- transformers/integrations/__init__.py +4 -2
- transformers/integrations/accelerate.py +15 -3
- transformers/integrations/aqlm.py +38 -66
- transformers/integrations/awq.py +48 -514
- transformers/integrations/bitnet.py +45 -100
- transformers/integrations/bitsandbytes.py +79 -191
- transformers/integrations/deepspeed.py +1 -0
- transformers/integrations/eetq.py +84 -79
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +236 -193
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +40 -62
- transformers/integrations/hub_kernels.py +42 -3
- transformers/integrations/integration_utils.py +10 -0
- transformers/integrations/mxfp4.py +25 -65
- transformers/integrations/peft.py +7 -29
- transformers/integrations/quanto.py +73 -55
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +44 -90
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +42 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +8 -0
- transformers/modeling_rope_utils.py +30 -6
- transformers/modeling_utils.py +116 -112
- transformers/models/__init__.py +3 -0
- transformers/models/afmoe/modeling_afmoe.py +4 -4
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +2 -0
- transformers/models/altclip/modeling_altclip.py +4 -0
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/modeling_aria.py +4 -4
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/auto/configuration_auto.py +11 -0
- transformers/models/auto/feature_extraction_auto.py +2 -0
- transformers/models/auto/image_processing_auto.py +1 -0
- transformers/models/auto/modeling_auto.py +6 -0
- transformers/models/auto/processing_auto.py +18 -10
- transformers/models/auto/tokenization_auto.py +74 -472
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/bamba/modeling_bamba.py +4 -3
- transformers/models/bark/modeling_bark.py +2 -0
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/big_bird/modeling_big_bird.py +6 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +11 -2
- transformers/models/bitnet/modeling_bitnet.py +4 -4
- transformers/models/blenderbot/modeling_blenderbot.py +5 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
- transformers/models/blip/modeling_blip_text.py +2 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -1
- transformers/models/bloom/modeling_bloom.py +4 -0
- transformers/models/blt/modeling_blt.py +2 -2
- transformers/models/blt/modular_blt.py +2 -2
- transformers/models/bridgetower/modeling_bridgetower.py +5 -1
- transformers/models/bros/modeling_bros.py +4 -0
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +5 -0
- transformers/models/chameleon/modeling_chameleon.py +2 -1
- transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
- transformers/models/clap/modeling_clap.py +5 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +5 -0
- transformers/models/clvp/modeling_clvp.py +5 -0
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +4 -3
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +7 -6
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
- transformers/models/convbert/modeling_convbert.py +6 -0
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/modeling_csm.py +4 -3
- transformers/models/ctrl/modeling_ctrl.py +1 -0
- transformers/models/cvt/modeling_cvt.py +2 -0
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/modeling_d_fine.py +2 -0
- transformers/models/d_fine/modular_d_fine.py +1 -0
- transformers/models/dab_detr/modeling_dab_detr.py +4 -0
- transformers/models/dac/modeling_dac.py +2 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/dbrx/modeling_dbrx.py +2 -2
- transformers/models/deberta/modeling_deberta.py +5 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
- transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
- transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
- transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/modeling_detr.py +5 -0
- transformers/models/dia/modeling_dia.py +4 -3
- transformers/models/dia/modular_dia.py +0 -1
- transformers/models/diffllama/modeling_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +2 -3
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +2 -0
- transformers/models/dots1/modeling_dots1.py +10 -7
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/edgetam/modeling_edgetam.py +1 -1
- transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
- transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
- transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
- transformers/models/efficientnet/modeling_efficientnet.py +2 -0
- transformers/models/emu3/modeling_emu3.py +4 -4
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +14 -2
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
- transformers/models/esm/modeling_esmfold.py +5 -4
- transformers/models/evolla/modeling_evolla.py +4 -4
- transformers/models/exaone4/modeling_exaone4.py +2 -2
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +6 -1
- transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
- transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
- transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
- transformers/models/flaubert/modeling_flaubert.py +7 -0
- transformers/models/flava/modeling_flava.py +6 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
- transformers/models/florence2/modeling_florence2.py +2 -1
- transformers/models/florence2/modular_florence2.py +2 -1
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/processing_fuyu.py +3 -3
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +2 -1
- transformers/models/gemma3/modeling_gemma3.py +14 -84
- transformers/models/gemma3/modular_gemma3.py +12 -81
- transformers/models/gemma3n/modeling_gemma3n.py +18 -209
- transformers/models/gemma3n/modular_gemma3n.py +17 -59
- transformers/models/git/modeling_git.py +2 -0
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/modeling_glm4v.py +3 -3
- transformers/models/glm4v/modular_glm4v.py +6 -4
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
- transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/gpt2/modeling_gpt2.py +5 -1
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
- transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
- transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
- transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
- transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
- transformers/models/gptj/modeling_gptj.py +3 -0
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granitemoe/modeling_granitemoe.py +4 -6
- transformers/models/granitemoe/modular_granitemoe.py +0 -2
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
- transformers/models/groupvit/modeling_groupvit.py +3 -0
- transformers/models/helium/modeling_helium.py +4 -3
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +3 -0
- transformers/models/hubert/modular_hubert.py +1 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
- transformers/models/ibert/modeling_ibert.py +6 -0
- transformers/models/idefics/modeling_idefics.py +5 -21
- transformers/models/imagegpt/modeling_imagegpt.py +2 -1
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/internvl/modeling_internvl.py +2 -4
- transformers/models/internvl/modular_internvl.py +2 -4
- transformers/models/jamba/modeling_jamba.py +2 -2
- transformers/models/janus/modeling_janus.py +1 -0
- transformers/models/janus/modular_janus.py +1 -0
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/kosmos2/modeling_kosmos2.py +1 -0
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +244 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +729 -0
- transformers/models/lasr/modular_lasr.py +569 -0
- transformers/models/lasr/processing_lasr.py +96 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +5 -0
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +3 -0
- transformers/models/lfm2/modeling_lfm2.py +4 -5
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +4 -0
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/modeling_llama4.py +3 -2
- transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
- transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -0
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +4 -0
- transformers/models/mamba/modeling_mamba.py +14 -22
- transformers/models/marian/modeling_marian.py +5 -0
- transformers/models/markuplm/modeling_markuplm.py +4 -0
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/modeling_mask2former.py +2 -0
- transformers/models/maskformer/modeling_maskformer.py +2 -0
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +3 -1
- transformers/models/minimax/modeling_minimax.py +4 -4
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +4 -3
- transformers/models/mistral/modeling_mistral.py +4 -3
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mllama/modeling_mllama.py +2 -2
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/modeling_mobilevit.py +3 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
- transformers/models/modernbert/modeling_modernbert.py +4 -1
- transformers/models/modernbert/modular_modernbert.py +2 -0
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
- transformers/models/moonshine/modeling_moonshine.py +4 -2
- transformers/models/moshi/modeling_moshi.py +5 -2
- transformers/models/mpnet/modeling_mpnet.py +5 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +6 -0
- transformers/models/mt5/modeling_mt5.py +7 -0
- transformers/models/musicgen/modeling_musicgen.py +2 -0
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nemotron/modeling_nemotron.py +4 -2
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nougat/tokenization_nougat.py +11 -59
- transformers/models/nystromformer/modeling_nystromformer.py +6 -0
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +4 -5
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
- transformers/models/oneformer/modeling_oneformer.py +4 -1
- transformers/models/openai/modeling_openai.py +3 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/owlv2/modeling_owlv2.py +4 -0
- transformers/models/owlvit/modeling_owlvit.py +4 -0
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +9 -6
- transformers/models/parakeet/modular_parakeet.py +2 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
- transformers/models/patchtst/modeling_patchtst.py +20 -2
- transformers/models/pegasus/modeling_pegasus.py +5 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
- transformers/models/perceiver/modeling_perceiver.py +8 -0
- transformers/models/persimmon/modeling_persimmon.py +2 -1
- transformers/models/phi/modeling_phi.py +4 -5
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +2 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
- transformers/models/phimoe/modeling_phimoe.py +4 -4
- transformers/models/phimoe/modular_phimoe.py +2 -2
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pixtral/modeling_pixtral.py +2 -1
- transformers/models/plbart/modeling_plbart.py +6 -0
- transformers/models/plbart/modular_plbart.py +2 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/modeling_poolformer.py +2 -0
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +3 -0
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
- transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
- transformers/models/rag/modeling_rag.py +1 -0
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
- transformers/models/reformer/modeling_reformer.py +4 -0
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +6 -1
- transformers/models/rembert/modeling_rembert.py +6 -0
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +11 -2
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/modeling_rt_detr.py +2 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
- transformers/models/rwkv/modeling_rwkv.py +1 -0
- transformers/models/sam2/modeling_sam2.py +2 -2
- transformers/models/sam2/modular_sam2.py +2 -2
- transformers/models/sam2_video/modeling_sam2_video.py +1 -0
- transformers/models/sam2_video/modular_sam2_video.py +1 -0
- transformers/models/sam3/modeling_sam3.py +77 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
- transformers/models/sam3_video/modeling_sam3_video.py +1 -0
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
- transformers/models/seed_oss/modeling_seed_oss.py +2 -2
- transformers/models/segformer/modeling_segformer.py +4 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/siglip2/modeling_siglip2.py +4 -0
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
- transformers/models/speecht5/modeling_speecht5.py +13 -1
- transformers/models/splinter/modeling_splinter.py +3 -0
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +6 -0
- transformers/models/stablelm/modeling_stablelm.py +3 -1
- transformers/models/starcoder2/modeling_starcoder2.py +4 -3
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +2 -0
- transformers/models/swin/modeling_swin.py +4 -0
- transformers/models/swin2sr/modeling_swin2sr.py +2 -0
- transformers/models/swinv2/modeling_swinv2.py +4 -0
- transformers/models/t5/modeling_t5.py +7 -0
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +5 -5
- transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
- transformers/models/table_transformer/modeling_table_transformer.py +4 -0
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +2 -0
- transformers/models/timesfm/modular_timesfm.py +2 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
- transformers/models/trocr/modeling_trocr.py +2 -0
- transformers/models/tvp/modeling_tvp.py +2 -0
- transformers/models/udop/modeling_udop.py +4 -0
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/modeling_umt5.py +7 -0
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/vilt/modeling_vilt.py +6 -0
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +6 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/modeling_vitmatte.py +1 -0
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/modeling_whisper.py +6 -0
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +3 -0
- transformers/models/xglm/modeling_xglm.py +1 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +5 -0
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/yoso/modeling_yoso.py +6 -0
- transformers/models/zamba/modeling_zamba.py +2 -0
- transformers/models/zamba2/modeling_zamba2.py +4 -2
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/modeling_zoedepth.py +1 -0
- transformers/pipelines/__init__.py +2 -3
- transformers/pipelines/base.py +1 -9
- transformers/pipelines/document_question_answering.py +3 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/processing_utils.py +23 -11
- transformers/quantizers/base.py +35 -110
- transformers/quantizers/quantizer_aqlm.py +1 -5
- transformers/quantizers/quantizer_auto_round.py +1 -2
- transformers/quantizers/quantizer_awq.py +17 -81
- transformers/quantizers/quantizer_bitnet.py +3 -8
- transformers/quantizers/quantizer_bnb_4bit.py +13 -110
- transformers/quantizers/quantizer_bnb_8bit.py +16 -92
- transformers/quantizers/quantizer_compressed_tensors.py +1 -5
- transformers/quantizers/quantizer_eetq.py +14 -62
- transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
- transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
- transformers/quantizers/quantizer_fp_quant.py +48 -78
- transformers/quantizers/quantizer_gptq.py +7 -24
- transformers/quantizers/quantizer_higgs.py +40 -54
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +13 -167
- transformers/quantizers/quantizer_quanto.py +20 -64
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +1 -4
- transformers/quantizers/quantizer_torchao.py +23 -202
- transformers/quantizers/quantizer_vptq.py +8 -22
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +297 -36
- transformers/tokenization_mistral_common.py +4 -0
- transformers/tokenization_utils_base.py +113 -222
- transformers/tokenization_utils_tokenizers.py +168 -107
- transformers/trainer.py +28 -31
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +66 -28
- transformers/utils/__init__.py +3 -4
- transformers/utils/auto_docstring.py +1 -0
- transformers/utils/generic.py +27 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +61 -16
- transformers/utils/kernel_config.py +4 -2
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +75 -242
- transformers/video_processing_utils.py +1 -2
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
transformers/models/unispeech_sat/modeling_unispeech_sat.py

@@ -1006,6 +1006,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, UniSpeechSatBaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):

@@ -1120,6 +1121,7 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, UniSpeechSatForPreTrainingOutput]:
         r"""
         Example:

@@ -1251,6 +1253,7 @@ class UniSpeechSatForCTC(UniSpeechSatPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):

@@ -1362,6 +1365,7 @@ class UniSpeechSatForSequenceClassification(UniSpeechSatPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):

@@ -1465,6 +1469,7 @@ class UniSpeechSatForAudioFrameClassification(UniSpeechSatPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):

@@ -1636,6 +1641,7 @@ class UniSpeechSatForXVector(UniSpeechSatPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, XVectorOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
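Every hunk in this section, here and in the model files below, makes the same change: the `forward` signature gains a trailing `**kwargs,` so extra keyword arguments forwarded by shared infrastructure are absorbed instead of raising a `TypeError`. A minimal sketch of the behavioral difference, using hypothetical stand-in modules rather than the real model classes:

    import torch
    from torch import nn
    from typing import Optional

    class StrictHead(nn.Module):  # shape of the rc0 signatures (illustration only)
        def forward(self, input_values: torch.Tensor, return_dict: Optional[bool] = None):
            return input_values.mean(dim=-1)

    class TolerantHead(nn.Module):  # shape of the rc1 signatures: **kwargs absorbs extras
        def forward(self, input_values: torch.Tensor, return_dict: Optional[bool] = None, **kwargs):
            return input_values.mean(dim=-1)

    x = torch.randn(2, 16)
    TolerantHead()(x, output_attentions=False)   # extra keyword argument is ignored
    # StrictHead()(x, output_attentions=False)   # would raise TypeError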
transformers/models/unispeech_sat/modular_unispeech_sat.py

@@ -255,6 +255,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel, Wav2Vec2Model):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, UniSpeechSatBaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):

@@ -369,6 +370,7 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, UniSpeechSatForPreTrainingOutput]:
         r"""
         Example:

transformers/models/univnet/modeling_univnet.py

@@ -476,6 +476,7 @@ class UnivNetModel(PreTrainedModel):
         padding_mask: Optional[torch.FloatTensor] = None,
         generator: Optional[torch.Generator] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], UnivNetModelOutput]:
         r"""
         noise_sequence (`torch.FloatTensor`, *optional*):

transformers/models/upernet/modeling_upernet.py

@@ -301,6 +301,7 @@ class UperNetForSemanticSegmentation(UperNetPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SemanticSegmenterOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
transformers/models/vaultgemma/modeling_vaultgemma.py

@@ -29,7 +29,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_func_from_hub
+from ...integrations import use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer

@@ -38,7 +38,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_vaultgemma import VaultGemmaConfig

@@ -160,6 +160,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class VaultGemmaAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
@@ -186,7 +187,6 @@ class VaultGemmaAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         self.attn_logit_softcapping = self.config.attn_logit_softcapping
         self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None
@@ -336,7 +336,7 @@ class VaultGemmaRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
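The VaultGemma hunks replace the `self.rotary_fn = apply_rotary_pos_emb` attribute with a `@use_kernelized_func(apply_rotary_pos_emb)` class decorator, and swap the hard-coded `torch.autocast` context for a `maybe_autocast` helper imported from `transformers.utils.generic`. The helper's implementation is not part of this diff; the sketch below is an assumption about its intent (degrade to a no-op when autocast is unavailable), not the actual code:

    import contextlib
    import torch

    def maybe_autocast(device_type: str, enabled: bool = True, **kwargs):
        """Assumed behavior only: act like torch.autocast, but fall back to a
        no-op context when the device type does not support autocast."""
        try:
            return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
        except RuntimeError:
            return contextlib.nullcontext()

    # Usage mirroring the new rotary-embedding code path (forcing float32 math):
    with maybe_autocast(device_type="cpu", enabled=False):
        freqs = torch.randn(1, 4, 8) @ torch.randn(1, 8, 16)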
transformers/models/vilt/modeling_vilt.py

@@ -556,6 +556,7 @@ class ViltModel(ViltPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[BaseModelOutputWithPooling, tuple[torch.FloatTensor]]:
         r"""
         image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):

@@ -708,6 +709,7 @@ class ViltForMaskedLM(ViltPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[MaskedLMOutput, tuple[torch.FloatTensor]]:
         r"""
         image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):

@@ -875,6 +877,7 @@ class ViltForQuestionAnswering(ViltPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[SequenceClassifierOutput, tuple[torch.FloatTensor]]:
         r"""
         image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):

@@ -979,6 +982,7 @@ class ViltForImageAndTextRetrieval(ViltPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[SequenceClassifierOutput, tuple[torch.FloatTensor]]:
         r"""
         image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):

@@ -1082,6 +1086,7 @@ class ViltForImagesAndTextClassification(ViltPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[ViltForImagesAndTextClassificationOutput, tuple[torch.FloatTensor]]:
         r"""
         image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):

@@ -1210,6 +1215,7 @@ class ViltForTokenClassification(ViltPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[TokenClassifierOutput, tuple[torch.FloatTensor]]:
         r"""
         image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):

transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py

@@ -184,6 +184,7 @@ class VisionTextDualEncoderModel(PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], CLIPOutput]:
         r"""
         return_loss (`bool`, *optional*):

transformers/models/visual_bert/modeling_visual_bert.py

@@ -550,6 +550,7 @@ class VisualBertModel(VisualBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPooling]:
         r"""
         visual_embeds (`torch.FloatTensor` of shape `(batch_size, visual_seq_length, visual_embedding_dim)`, *optional*):

@@ -735,6 +736,7 @@ class VisualBertForPreTraining(VisualBertPreTrainedModel):
         return_dict: Optional[bool] = None,
         labels: Optional[torch.LongTensor] = None,
         sentence_image_labels: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], VisualBertForPreTrainingOutput]:
         r"""
         visual_embeds (`torch.FloatTensor` of shape `(batch_size, visual_seq_length, visual_embedding_dim)`, *optional*):

@@ -877,6 +879,7 @@ class VisualBertForMultipleChoice(VisualBertPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):

@@ -1063,6 +1066,7 @@ class VisualBertForQuestionAnswering(VisualBertPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
         r"""
         visual_embeds (`torch.FloatTensor` of shape `(batch_size, visual_seq_length, visual_embedding_dim)`, *optional*):

@@ -1199,6 +1203,7 @@ class VisualBertForVisualReasoning(VisualBertPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
         r"""
         visual_embeds (`torch.FloatTensor` of shape `(batch_size, visual_seq_length, visual_embedding_dim)`, *optional*):

@@ -1372,6 +1377,7 @@ class VisualBertForRegionToPhraseAlignment(VisualBertPreTrainedModel):
         return_dict: Optional[bool] = None,
         region_to_phrase_position: Optional[torch.LongTensor] = None,
         labels: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
         r"""
         visual_embeds (`torch.FloatTensor` of shape `(batch_size, visual_seq_length, visual_embedding_dim)`, *optional*):
transformers/models/vitdet/modeling_vitdet.py

@@ -630,6 +630,7 @@ class VitDetModel(VitDetPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         Examples:

@@ -706,6 +707,7 @@ class VitDetBackbone(VitDetPreTrainedModel, BackboneMixin):
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> BackboneOutput:
         r"""
         Examples:

transformers/models/vitmatte/modeling_vitmatte.py

@@ -234,6 +234,7 @@ class VitMatteForImageMatting(VitMattePreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ):
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):

transformers/models/vits/modeling_vits.py

@@ -1275,6 +1275,7 @@ class VitsModel(VitsPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.FloatTensor] = None,
+        **kwargs,
     ) -> Union[tuple[Any], VitsModelOutput]:
         r"""
         speaker_id (`int`, *optional*):

transformers/models/vjepa2/modeling_vjepa2.py

@@ -1088,6 +1088,7 @@ class VJEPA2ForVideoClassification(VJEPA2PreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ImageClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
transformers/models/wav2vec2/modeling_wav2vec2.py

@@ -1340,6 +1340,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Wav2Vec2BaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):

@@ -1453,6 +1454,7 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Wav2Vec2ForPreTrainingOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):

@@ -1628,6 +1630,7 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, MaskedLMOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -1729,6 +1732,7 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):

@@ -1840,6 +1844,7 @@ class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):

@@ -1943,6 +1948,7 @@ class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):

@@ -2114,6 +2120,7 @@ class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, XVectorOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py

@@ -994,6 +994,7 @@ class Wav2Vec2BertModel(Wav2Vec2BertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Wav2Vec2BertBaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):

@@ -1086,6 +1087,7 @@ class Wav2Vec2BertForCTC(Wav2Vec2BertPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):

@@ -1192,6 +1194,7 @@ class Wav2Vec2BertForSequenceClassification(Wav2Vec2BertPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):

@@ -1282,6 +1285,7 @@ class Wav2Vec2BertForAudioFrameClassification(Wav2Vec2BertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):

@@ -1440,6 +1444,7 @@ class Wav2Vec2BertForXVector(Wav2Vec2BertPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, XVectorOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):

transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py

@@ -702,6 +702,7 @@ class Wav2Vec2BertModel(Wav2Vec2Model, Wav2Vec2BertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Wav2Vec2BertBaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):

@@ -768,6 +769,7 @@ class Wav2Vec2BertForCTC(Wav2Vec2ConformerForCTC):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):

@@ -856,6 +858,7 @@ class Wav2Vec2BertForSequenceClassification(Wav2Vec2ForSequenceClassification):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):

@@ -926,6 +929,7 @@ class Wav2Vec2BertForAudioFrameClassification(Wav2Vec2ConformerForAudioFrameClas
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):

@@ -987,6 +991,7 @@ class Wav2Vec2BertForXVector(Wav2Vec2ConformerForXVector):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, XVectorOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py

@@ -1142,6 +1142,7 @@ class Wav2Vec2ConformerModel(Wav2Vec2ConformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Wav2Vec2ConformerBaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):

@@ -1255,6 +1256,7 @@ class Wav2Vec2ConformerForPreTraining(Wav2Vec2ConformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Wav2Vec2ConformerForPreTrainingOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):

@@ -1459,6 +1461,7 @@ class Wav2Vec2ConformerForCTC(Wav2Vec2ConformerPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):

@@ -1570,6 +1573,7 @@ class Wav2Vec2ConformerForSequenceClassification(Wav2Vec2ConformerPreTrainedMode
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):

@@ -1673,6 +1677,7 @@ class Wav2Vec2ConformerForAudioFrameClassification(Wav2Vec2ConformerPreTrainedMo
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):

@@ -1844,6 +1849,7 @@ class Wav2Vec2ConformerForXVector(Wav2Vec2ConformerPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, XVectorOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -1047,6 +1047,7 @@ class WavLMModel(WavLMPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, WavLMBaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1180,6 +1181,7 @@ class WavLMForCTC(WavLMPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
@@ -1291,6 +1293,7 @@ class WavLMForSequenceClassification(WavLMPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -1394,6 +1397,7 @@ class WavLMForAudioFrameClassification(WavLMPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -1565,6 +1569,7 @@ class WavLMForXVector(WavLMPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, XVectorOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -608,6 +608,7 @@ class WhisperEncoder(WhisperPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -734,6 +735,7 @@ class WhisperDecoder(WhisperPreTrainedModel):
         output_hidden_states=None,
         return_dict=None,
         cache_position=None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -982,6 +984,7 @@ class WhisperModel(WhisperPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1129,6 +1132,7 @@ class WhisperForConditionalGeneration(WhisperGenerationMixin, WhisperPreTrainedM
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1299,6 +1303,7 @@ class WhisperForCausalLM(WhisperPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
         r"""
         encoder_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
@@ -1422,6 +1427,7 @@ class WhisperForAudioClassification(WhisperPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -19,7 +19,7 @@ import os
 import re
 import warnings
 from functools import lru_cache
-from typing import Optional
+from typing import Optional, Union
 
 import numpy as np
 from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, processors
@@ -204,10 +204,11 @@ class WhisperTokenizer(TokenizersBackend):
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    model = BPE
 
     def __init__(
         self,
-        vocab=None,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
         merges=None,
         normalizer_file=None,
         unk_token="<|endoftext|>",
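The hunk above also tightens the `vocab` parameter's annotation to `Optional[Union[str, dict[str, int]]]`: either a path-like string or an in-memory token-to-id mapping (or `None`). A small sketch of the two admitted shapes — the file name and mapping contents are made up, and how the tokenizer actually consumes each form is not shown in this diff:

```python
from typing import Optional, Union

# The two shapes admitted by the new annotation; values are illustrative only.
vocab_from_path: Optional[Union[str, dict[str, int]]] = "vocab.json"
vocab_from_dict: Optional[Union[str, dict[str, int]]] = {"<|endoftext|>": 0, "hello": 1}


def describe(vocab: Optional[Union[str, dict[str, int]]]) -> str:
    # Mirrors the kind of dispatch a constructor with this signature could perform.
    if vocab is None:
        return "no vocab supplied"
    if isinstance(vocab, str):
        return f"vocab file at {vocab!r}"
    return f"in-memory vocab with {len(vocab)} entries"


print(describe(vocab_from_path))  # vocab file at 'vocab.json'
print(describe(vocab_from_dict))  # in-memory vocab with 2 entries
```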
@@ -253,7 +254,6 @@ class WhisperTokenizer(TokenizersBackend):
         self._tokenizer.decoder = decoders.ByteLevel()
 
         super().__init__(
-            tokenizer_object=self._tokenizer,
             unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
@@ -276,18 +276,7 @@ class WhisperTokenizer(TokenizersBackend):
         self.language = language
         self.task = task
         self.predict_timestamps = predict_timestamps
-
-        self._post_init()
-
-    def _post_init(self):
-        """Post-initialization hook to set up prefix tokens after the tokenizer is fully loaded."""
-        super()._post_init()
-        # Set up prefix tokens if language or task is specified (may be set from config in from_pretrained)
-        if hasattr(self, "language") and hasattr(self, "task") and hasattr(self, "predict_timestamps"):
-            if self.language is not None or self.task is not None:
-                self.set_prefix_tokens(
-                    language=self.language, task=self.task, predict_timestamps=self.predict_timestamps
-                )
+        self.set_prefix_tokens()
 
     # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._decode_with_timestamps
     def _decode_with_timestamps(
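This hunk drops the deferred `_post_init` hook and instead calls `self.set_prefix_tokens()` directly at the end of `__init__`. A toy before/after sketch of that simplification — the method names mirror the diff, but the bodies are illustrative only:

```python
class ToyTokenizerBefore:
    def __init__(self, language=None):
        self.language = language
        self.prefix = ""
        self._post_init()  # configuration deferred to a separate hook

    def _post_init(self):
        # Only configure prefix tokens when something was actually requested.
        if self.language is not None:
            self.set_prefix_tokens(language=self.language)

    def set_prefix_tokens(self, language=None):
        self.prefix = f"<|{language}|>" if language else ""


class ToyTokenizerAfter:
    def __init__(self, language=None):
        self.language = language
        self.set_prefix_tokens()  # configured directly at the end of __init__, as in the diff

    def set_prefix_tokens(self, language=None):
        language = language if language is not None else self.language
        self.prefix = f"<|{language}|>" if language else ""


print(ToyTokenizerBefore("en").prefix)  # <|en|>
print(ToyTokenizerAfter("en").prefix)   # <|en|>
```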
@@ -737,6 +737,7 @@ class XCLIPTextModel(XCLIPPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         Examples:
@@ -927,6 +928,7 @@ class XCLIPVisionModel(XCLIPPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         Examples:
@@ -1340,6 +1342,7 @@ class XCLIPModel(XCLIPPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, XCLIPOutput]:
         r"""
         return_loss (`bool`, *optional*):
@@ -407,6 +407,7 @@ class XGLMModel(XGLMPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
         r"""
         encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):