transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +30 -3
- transformers/cli/serve.py +47 -17
- transformers/conversion_mapping.py +15 -2
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +196 -135
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +1 -2
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +1 -2
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/configuration_utils.py +3 -2
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/continuous_api.py +134 -79
- transformers/image_processing_base.py +1 -2
- transformers/integrations/__init__.py +4 -2
- transformers/integrations/accelerate.py +15 -3
- transformers/integrations/aqlm.py +38 -66
- transformers/integrations/awq.py +48 -514
- transformers/integrations/bitnet.py +45 -100
- transformers/integrations/bitsandbytes.py +79 -191
- transformers/integrations/deepspeed.py +1 -0
- transformers/integrations/eetq.py +84 -79
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +236 -193
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +40 -62
- transformers/integrations/hub_kernels.py +42 -3
- transformers/integrations/integration_utils.py +10 -0
- transformers/integrations/mxfp4.py +25 -65
- transformers/integrations/peft.py +7 -29
- transformers/integrations/quanto.py +73 -55
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +44 -90
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +42 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +8 -0
- transformers/modeling_rope_utils.py +30 -6
- transformers/modeling_utils.py +116 -112
- transformers/models/__init__.py +3 -0
- transformers/models/afmoe/modeling_afmoe.py +4 -4
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +2 -0
- transformers/models/altclip/modeling_altclip.py +4 -0
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/modeling_aria.py +4 -4
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/auto/configuration_auto.py +11 -0
- transformers/models/auto/feature_extraction_auto.py +2 -0
- transformers/models/auto/image_processing_auto.py +1 -0
- transformers/models/auto/modeling_auto.py +6 -0
- transformers/models/auto/processing_auto.py +18 -10
- transformers/models/auto/tokenization_auto.py +74 -472
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/bamba/modeling_bamba.py +4 -3
- transformers/models/bark/modeling_bark.py +2 -0
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/big_bird/modeling_big_bird.py +6 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +11 -2
- transformers/models/bitnet/modeling_bitnet.py +4 -4
- transformers/models/blenderbot/modeling_blenderbot.py +5 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
- transformers/models/blip/modeling_blip_text.py +2 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -1
- transformers/models/bloom/modeling_bloom.py +4 -0
- transformers/models/blt/modeling_blt.py +2 -2
- transformers/models/blt/modular_blt.py +2 -2
- transformers/models/bridgetower/modeling_bridgetower.py +5 -1
- transformers/models/bros/modeling_bros.py +4 -0
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +5 -0
- transformers/models/chameleon/modeling_chameleon.py +2 -1
- transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
- transformers/models/clap/modeling_clap.py +5 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +5 -0
- transformers/models/clvp/modeling_clvp.py +5 -0
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +4 -3
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +7 -6
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
- transformers/models/convbert/modeling_convbert.py +6 -0
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/modeling_csm.py +4 -3
- transformers/models/ctrl/modeling_ctrl.py +1 -0
- transformers/models/cvt/modeling_cvt.py +2 -0
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/modeling_d_fine.py +2 -0
- transformers/models/d_fine/modular_d_fine.py +1 -0
- transformers/models/dab_detr/modeling_dab_detr.py +4 -0
- transformers/models/dac/modeling_dac.py +2 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/dbrx/modeling_dbrx.py +2 -2
- transformers/models/deberta/modeling_deberta.py +5 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
- transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
- transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
- transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/modeling_detr.py +5 -0
- transformers/models/dia/modeling_dia.py +4 -3
- transformers/models/dia/modular_dia.py +0 -1
- transformers/models/diffllama/modeling_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +2 -3
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +2 -0
- transformers/models/dots1/modeling_dots1.py +10 -7
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/edgetam/modeling_edgetam.py +1 -1
- transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
- transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
- transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
- transformers/models/efficientnet/modeling_efficientnet.py +2 -0
- transformers/models/emu3/modeling_emu3.py +4 -4
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +14 -2
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
- transformers/models/esm/modeling_esmfold.py +5 -4
- transformers/models/evolla/modeling_evolla.py +4 -4
- transformers/models/exaone4/modeling_exaone4.py +2 -2
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +6 -1
- transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
- transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
- transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
- transformers/models/flaubert/modeling_flaubert.py +7 -0
- transformers/models/flava/modeling_flava.py +6 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
- transformers/models/florence2/modeling_florence2.py +2 -1
- transformers/models/florence2/modular_florence2.py +2 -1
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/processing_fuyu.py +3 -3
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +2 -1
- transformers/models/gemma3/modeling_gemma3.py +14 -84
- transformers/models/gemma3/modular_gemma3.py +12 -81
- transformers/models/gemma3n/modeling_gemma3n.py +18 -209
- transformers/models/gemma3n/modular_gemma3n.py +17 -59
- transformers/models/git/modeling_git.py +2 -0
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/modeling_glm4v.py +3 -3
- transformers/models/glm4v/modular_glm4v.py +6 -4
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
- transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/gpt2/modeling_gpt2.py +5 -1
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
- transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
- transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
- transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
- transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
- transformers/models/gptj/modeling_gptj.py +3 -0
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granitemoe/modeling_granitemoe.py +4 -6
- transformers/models/granitemoe/modular_granitemoe.py +0 -2
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
- transformers/models/groupvit/modeling_groupvit.py +3 -0
- transformers/models/helium/modeling_helium.py +4 -3
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +3 -0
- transformers/models/hubert/modular_hubert.py +1 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
- transformers/models/ibert/modeling_ibert.py +6 -0
- transformers/models/idefics/modeling_idefics.py +5 -21
- transformers/models/imagegpt/modeling_imagegpt.py +2 -1
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/internvl/modeling_internvl.py +2 -4
- transformers/models/internvl/modular_internvl.py +2 -4
- transformers/models/jamba/modeling_jamba.py +2 -2
- transformers/models/janus/modeling_janus.py +1 -0
- transformers/models/janus/modular_janus.py +1 -0
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/kosmos2/modeling_kosmos2.py +1 -0
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +244 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +729 -0
- transformers/models/lasr/modular_lasr.py +569 -0
- transformers/models/lasr/processing_lasr.py +96 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +5 -0
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +3 -0
- transformers/models/lfm2/modeling_lfm2.py +4 -5
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +4 -0
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/modeling_llama4.py +3 -2
- transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
- transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -0
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +4 -0
- transformers/models/mamba/modeling_mamba.py +14 -22
- transformers/models/marian/modeling_marian.py +5 -0
- transformers/models/markuplm/modeling_markuplm.py +4 -0
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/modeling_mask2former.py +2 -0
- transformers/models/maskformer/modeling_maskformer.py +2 -0
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +3 -1
- transformers/models/minimax/modeling_minimax.py +4 -4
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +4 -3
- transformers/models/mistral/modeling_mistral.py +4 -3
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mllama/modeling_mllama.py +2 -2
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/modeling_mobilevit.py +3 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
- transformers/models/modernbert/modeling_modernbert.py +4 -1
- transformers/models/modernbert/modular_modernbert.py +2 -0
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
- transformers/models/moonshine/modeling_moonshine.py +4 -2
- transformers/models/moshi/modeling_moshi.py +5 -2
- transformers/models/mpnet/modeling_mpnet.py +5 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +6 -0
- transformers/models/mt5/modeling_mt5.py +7 -0
- transformers/models/musicgen/modeling_musicgen.py +2 -0
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nemotron/modeling_nemotron.py +4 -2
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nougat/tokenization_nougat.py +11 -59
- transformers/models/nystromformer/modeling_nystromformer.py +6 -0
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +4 -5
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
- transformers/models/oneformer/modeling_oneformer.py +4 -1
- transformers/models/openai/modeling_openai.py +3 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/owlv2/modeling_owlv2.py +4 -0
- transformers/models/owlvit/modeling_owlvit.py +4 -0
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +9 -6
- transformers/models/parakeet/modular_parakeet.py +2 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
- transformers/models/patchtst/modeling_patchtst.py +20 -2
- transformers/models/pegasus/modeling_pegasus.py +5 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
- transformers/models/perceiver/modeling_perceiver.py +8 -0
- transformers/models/persimmon/modeling_persimmon.py +2 -1
- transformers/models/phi/modeling_phi.py +4 -5
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +2 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
- transformers/models/phimoe/modeling_phimoe.py +4 -4
- transformers/models/phimoe/modular_phimoe.py +2 -2
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pixtral/modeling_pixtral.py +2 -1
- transformers/models/plbart/modeling_plbart.py +6 -0
- transformers/models/plbart/modular_plbart.py +2 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/modeling_poolformer.py +2 -0
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +3 -0
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
- transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
- transformers/models/rag/modeling_rag.py +1 -0
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
- transformers/models/reformer/modeling_reformer.py +4 -0
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +6 -1
- transformers/models/rembert/modeling_rembert.py +6 -0
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +11 -2
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/modeling_rt_detr.py +2 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
- transformers/models/rwkv/modeling_rwkv.py +1 -0
- transformers/models/sam2/modeling_sam2.py +2 -2
- transformers/models/sam2/modular_sam2.py +2 -2
- transformers/models/sam2_video/modeling_sam2_video.py +1 -0
- transformers/models/sam2_video/modular_sam2_video.py +1 -0
- transformers/models/sam3/modeling_sam3.py +77 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
- transformers/models/sam3_video/modeling_sam3_video.py +1 -0
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
- transformers/models/seed_oss/modeling_seed_oss.py +2 -2
- transformers/models/segformer/modeling_segformer.py +4 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/siglip2/modeling_siglip2.py +4 -0
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
- transformers/models/speecht5/modeling_speecht5.py +13 -1
- transformers/models/splinter/modeling_splinter.py +3 -0
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +6 -0
- transformers/models/stablelm/modeling_stablelm.py +3 -1
- transformers/models/starcoder2/modeling_starcoder2.py +4 -3
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +2 -0
- transformers/models/swin/modeling_swin.py +4 -0
- transformers/models/swin2sr/modeling_swin2sr.py +2 -0
- transformers/models/swinv2/modeling_swinv2.py +4 -0
- transformers/models/t5/modeling_t5.py +7 -0
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +5 -5
- transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
- transformers/models/table_transformer/modeling_table_transformer.py +4 -0
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +2 -0
- transformers/models/timesfm/modular_timesfm.py +2 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
- transformers/models/trocr/modeling_trocr.py +2 -0
- transformers/models/tvp/modeling_tvp.py +2 -0
- transformers/models/udop/modeling_udop.py +4 -0
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/modeling_umt5.py +7 -0
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/vilt/modeling_vilt.py +6 -0
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +6 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/modeling_vitmatte.py +1 -0
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/modeling_whisper.py +6 -0
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +3 -0
- transformers/models/xglm/modeling_xglm.py +1 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +5 -0
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/yoso/modeling_yoso.py +6 -0
- transformers/models/zamba/modeling_zamba.py +2 -0
- transformers/models/zamba2/modeling_zamba2.py +4 -2
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/modeling_zoedepth.py +1 -0
- transformers/pipelines/__init__.py +2 -3
- transformers/pipelines/base.py +1 -9
- transformers/pipelines/document_question_answering.py +3 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/processing_utils.py +23 -11
- transformers/quantizers/base.py +35 -110
- transformers/quantizers/quantizer_aqlm.py +1 -5
- transformers/quantizers/quantizer_auto_round.py +1 -2
- transformers/quantizers/quantizer_awq.py +17 -81
- transformers/quantizers/quantizer_bitnet.py +3 -8
- transformers/quantizers/quantizer_bnb_4bit.py +13 -110
- transformers/quantizers/quantizer_bnb_8bit.py +16 -92
- transformers/quantizers/quantizer_compressed_tensors.py +1 -5
- transformers/quantizers/quantizer_eetq.py +14 -62
- transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
- transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
- transformers/quantizers/quantizer_fp_quant.py +48 -78
- transformers/quantizers/quantizer_gptq.py +7 -24
- transformers/quantizers/quantizer_higgs.py +40 -54
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +13 -167
- transformers/quantizers/quantizer_quanto.py +20 -64
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +1 -4
- transformers/quantizers/quantizer_torchao.py +23 -202
- transformers/quantizers/quantizer_vptq.py +8 -22
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +297 -36
- transformers/tokenization_mistral_common.py +4 -0
- transformers/tokenization_utils_base.py +113 -222
- transformers/tokenization_utils_tokenizers.py +168 -107
- transformers/trainer.py +28 -31
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +66 -28
- transformers/utils/__init__.py +3 -4
- transformers/utils/auto_docstring.py +1 -0
- transformers/utils/generic.py +27 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +61 -16
- transformers/utils/kernel_config.py +4 -2
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +75 -242
- transformers/video_processing_utils.py +1 -2
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
@@ -13,7 +13,7 @@ import torch.nn as nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_func_from_hub
+from ...integrations import use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import (
     GenericForSequenceClassification,
@@ -25,7 +25,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_phi import PhiConfig


@@ -90,7 +90,7 @@ class PhiRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
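Every rotary-embedding hunk in this diff replaces the old bare autocast context (its removed line is truncated to `with` in this rendering) with a `maybe_autocast` helper imported from `...utils.generic`. As a rough, unofficial sketch of what such a helper can look like (the real one ships in `transformers/utils/generic.py` and may differ), assuming it simply degrades to a no-op when autocast is not applicable:

# Illustrative sketch only; not the actual transformers implementation.
import contextlib
import torch

def maybe_autocast(device_type: str, enabled: bool = True, **kwargs):
    # Forward to torch.autocast when the device type is supported,
    # otherwise fall back to a do-nothing context manager.
    try:
        return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
    except (RuntimeError, ValueError):
        return contextlib.nullcontext()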
@@ -172,6 +172,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class PhiAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -187,7 +188,6 @@ class PhiAttention(nn.Module):
         self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True)
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
         self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
-        self.rotary_fn = apply_rotary_pos_emb
         self.dense = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=True)
         self.rotary_ndims = int(self.head_dim * config.rope_parameters["partial_rotary_factor"])
         self.qk_layernorm = config.qk_layernorm
@@ -206,7 +206,6 @@ class PhiAttention(nn.Module):
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
@@ -92,7 +92,6 @@ class PhiAttention(LlamaAttention):
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
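The attention hunks above (and the matching Phimoe and Qwen2 hunks later in this diff) move the rotary function from a per-instance attribute (`self.rotary_fn = apply_rotary_pos_emb`, now removed) to a class decorator, `@use_kernelized_func(apply_rotary_pos_emb)`. A hypothetical sketch of the decorator pattern follows; the attribute name and any kernel-hub routing are assumptions, not taken from the rc1 source:

# Hypothetical sketch; the real use_kernelized_func lives in transformers.integrations
# and may swap in a hub-provided kernel rather than the plain Python function.
from torch import nn

def use_kernelized_func(func):
    def wrap(cls):
        # Expose the callable at class level so __init__ no longer has to assign it.
        cls.rotary_fn = staticmethod(func)  # attribute name assumed from the removed lines
        return cls
    return wrap

def apply_rotary_pos_emb(q, k, cos, sin):  # stand-in for the real helper
    return q, k

@use_kernelized_func(apply_rotary_pos_emb)
class ToyAttention(nn.Module):
    pass

assert ToyAttention.rotary_fn is apply_rotary_pos_emb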
@@ -44,6 +44,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.generic import maybe_autocast
 from .configuration_phi3 import Phi3Config


@@ -123,7 +124,7 @@ class Phi3RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -47,7 +47,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import auto_docstring, can_return_tuple, torch_int
-from ...utils.generic import TransformersKwargs, check_model_inputs
+from ...utils.generic import TransformersKwargs, check_model_inputs, maybe_autocast
 from .configuration_phi4_multimodal import Phi4MultimodalAudioConfig, Phi4MultimodalConfig, Phi4MultimodalVisionConfig


@@ -602,7 +602,7 @@ class Phi4MultimodalImageEmbedding(nn.Module):

         # Temporarily disable autocast to avoid issue on bf16 tensors
         # Ref: https://github.com/pytorch/pytorch/issues/132715
-        with
+        with maybe_autocast(device_type=inputs_embeds.device.type, enabled=False):
             image_embeds = inputs_embeds.index_put(
                 indices=positions_tuple, values=merged_img_set_tensor, accumulate=False
             )
@@ -1014,7 +1014,7 @@ class Phi4MultimodalAudioModel(Phi4MultimodalAudioPreTrainedModel):
         pad_mask = pad_mask & enc_streaming_mask
         return pad_mask

-    def forward(self, hidden_states: torch.Tensor, mask: Optional[torch.Tensor]):
+    def forward(self, hidden_states: torch.Tensor, mask: Optional[torch.Tensor], **kwargs):
         hidden_states = self.encoder_embedding(hidden_states)
         hidden_states, hs_mask, mask = self.forward_embeddings(hidden_states, mask)

@@ -1116,7 +1116,7 @@ class Phi4MultimodalAudioEmbedding(nn.Module):
         merged_audio_embeds = merged_audio_embeds.to(dtype=inputs_embeds.dtype, device=inputs_embeds.device)
         # Temporarily disable autocast to avoid issue on bf16 tensors
         # Ref: https://github.com/pytorch/pytorch/issues/132715
-        with
+        with maybe_autocast(device_type=inputs_embeds.device.type, enabled=False):
             audio_embeds = inputs_embeds.index_put(
                 indices=positions_tuple, values=merged_audio_embeds, accumulate=False
             )
@@ -1500,7 +1500,7 @@ class Phi4MultimodalRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -37,7 +37,7 @@ from ...modeling_rope_utils import RopeParameters
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import auto_docstring, logging
-from ...utils.generic import TransformersKwargs, check_model_inputs
+from ...utils.generic import TransformersKwargs, check_model_inputs, maybe_autocast
 from ..phi3.configuration_phi3 import Phi3Config
 from ..phi3.modeling_phi3 import (
     Phi3DecoderLayer,
@@ -844,7 +844,7 @@ class Phi4MultimodalImageEmbedding(nn.Module):

         # Temporarily disable autocast to avoid issue on bf16 tensors
         # Ref: https://github.com/pytorch/pytorch/issues/132715
-        with
+        with maybe_autocast(device_type=inputs_embeds.device.type, enabled=False):
             image_embeds = inputs_embeds.index_put(
                 indices=positions_tuple, values=merged_img_set_tensor, accumulate=False
             )
@@ -1205,7 +1205,7 @@ class Phi4MultimodalAudioModel(Phi4MultimodalAudioPreTrainedModel):
         pad_mask = pad_mask & enc_streaming_mask
         return pad_mask

-    def forward(self, hidden_states: torch.Tensor, mask: Optional[torch.Tensor]):
+    def forward(self, hidden_states: torch.Tensor, mask: Optional[torch.Tensor], **kwargs):
         hidden_states = self.encoder_embedding(hidden_states)
         hidden_states, hs_mask, mask = self.forward_embeddings(hidden_states, mask)

@@ -1358,7 +1358,7 @@ class Phi4MultimodalAudioEmbedding(nn.Module):
         merged_audio_embeds = merged_audio_embeds.to(dtype=inputs_embeds.dtype, device=inputs_embeds.device)
         # Temporarily disable autocast to avoid issue on bf16 tensors
         # Ref: https://github.com/pytorch/pytorch/issues/132715
-        with
+        with maybe_autocast(device_type=inputs_embeds.device.type, enabled=False):
             audio_embeds = inputs_embeds.index_put(
                 indices=positions_tuple, values=merged_audio_embeds, accumulate=False
             )
@@ -30,7 +30,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
 from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
@@ -38,7 +38,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import OutputRecorder, check_model_inputs
+from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from .configuration_phimoe import PhimoeConfig


@@ -113,7 +113,7 @@ class PhimoeRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * mscale
@@ -194,6 +194,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class PhimoeAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -219,7 +220,6 @@ class PhimoeAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb

     def forward(
         self,
@@ -24,7 +24,7 @@ from ...modeling_layers import (
     GenericForSequenceClassification,
 )
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
-from ...utils.generic import OutputRecorder
+from ...utils.generic import OutputRecorder, maybe_autocast
 from ..llama.modeling_llama import LlamaAttention
 from ..mixtral.modeling_mixtral import (
     MixtralDecoderLayer,
@@ -74,7 +74,7 @@ class PhimoeRotaryEmbedding(MixtralRotaryEmbedding):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * mscale
@@ -481,6 +481,7 @@ class Pix2StructVisionModel(Pix2StructPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
@@ -1359,6 +1360,7 @@ class Pix2StructForConditionalGeneration(Pix2StructPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
         r"""
         flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
@@ -28,6 +28,7 @@ from ...modeling_rope_utils import dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import auto_docstring, can_return_tuple, logging
+from ...utils.generic import maybe_autocast
 from .configuration_pixtral import PixtralVisionConfig


@@ -125,7 +126,7 @@ class PixtralRotaryEmbedding(nn.Module):
     def forward(self, x, position_ids):
         freqs = self.inv_freq[position_ids]
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             emb = freqs
             cos = emb.cos()
             sin = emb.sin()
@@ -366,6 +366,7 @@ class PLBartEncoder(PLBartPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         Args:
@@ -621,6 +622,7 @@ class PLBartDecoder(PLBartPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         r"""
         Args:
@@ -867,6 +869,7 @@ class PLBartModel(PLBartPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1002,6 +1005,7 @@ class PLBartForConditionalGeneration(PLBartPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1159,6 +1163,7 @@ class PLBartForSequenceClassification(PLBartPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqSequenceClassifierOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1316,6 +1321,7 @@ class PLBartForCausalLM(PLBartPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -108,6 +108,7 @@ class PLBartModel(PLBartPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -243,6 +244,7 @@ class PLBartForConditionalGeneration(PLBartPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -125,7 +125,6 @@ class PLBartTokenizer(SentencePieceBackend):
         pad_token="<pad>",
         mask_token="<mask>",
         language_codes="base",
-        tokenizer_file=None,
         src_lang=None,
         tgt_lang=None,
         sp_model_kwargs: Optional[dict[str, Any]] = None,
@@ -171,7 +170,6 @@ class PLBartTokenizer(SentencePieceBackend):
             cls_token=cls_token,
             pad_token=pad_token,
             mask_token=mask_token,
-            tokenizer_file=tokenizer_file,
             src_lang=src_lang,
             tgt_lang=tgt_lang,
             additional_special_tokens=_additional_special_tokens,
@@ -276,6 +276,7 @@ class PoolFormerModel(PoolFormerPreTrainedModel):
         pixel_values: Optional[torch.FloatTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithNoAttention]:
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -339,6 +340,7 @@ class PoolFormerForImageClassification(PoolFormerPreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -644,6 +644,7 @@ class Pop2PianoStack(Pop2PianoPreTrainedModel):
         output_hidden_states=None,
         return_dict=None,
         cache_position=None,
+        **kwargs,
     ):
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1051,6 +1052,7 @@ class Pop2PianoForConditionalGeneration(Pop2PianoPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -393,6 +393,7 @@ class PromptDepthAnythingForDepthEstimation(PromptDepthAnythingPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], DepthEstimatorOutput]:
         r"""
         prompt_depth (`torch.FloatTensor` of shape `(batch_size, 1, height, width)`, *optional*):
@@ -236,6 +236,7 @@ class PromptDepthAnythingForDepthEstimation(DepthAnythingForDepthEstimation):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], DepthEstimatorOutput]:
         r"""
         prompt_depth (`torch.FloatTensor` of shape `(batch_size, 1, height, width)`, *optional*):
@@ -993,6 +993,7 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         Example:
@@ -1113,6 +1114,7 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, ProphetNetDecoderModelOutput]:
         r"""
         Example:
@@ -1416,6 +1418,7 @@ class ProphetNetModel(ProphetNetPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, ProphetNetSeq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -458,6 +458,7 @@ class PvtModel(PvtPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -512,6 +513,7 @@ class PvtForImageClassification(PvtPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ImageClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -406,6 +406,7 @@ class PvtV2Model(PvtV2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -460,6 +461,7 @@ class PvtV2ForImageClassification(PvtV2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ImageClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -523,6 +525,7 @@ class PvtV2Backbone(PvtV2Model, BackboneMixin):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> BackboneOutput:
         r"""
         Examples:
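The long run of hunks above (Pix2Struct through PvtV2, plus the Phi4Multimodal audio model earlier) all make the same mechanical change: a trailing `**kwargs` on `forward` signatures. The diff does not state the motivation; the assumed rationale, sketched with toy modules below (not code from transformers), is that wrappers can then fan the same keyword arguments out to every submodule without raising a `TypeError`:

# Assumed rationale, illustrated with toy modules; not code from the diff itself.
import torch
from torch import nn

class Backbone(nn.Module):
    def forward(self, x, **kwargs):        # tolerates extra kwargs it does not use
        return x * 2

class Head(nn.Module):
    def forward(self, x, scale=1.0, **kwargs):
        return x * scale

class Wrapper(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone, self.head = Backbone(), Head()

    def forward(self, x, **kwargs):
        # The same kwargs can be forwarded to both submodules without TypeError.
        return self.head(self.backbone(x, **kwargs), **kwargs)

print(Wrapper()(torch.ones(2), scale=3.0))  # tensor([6., 6.])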
@@ -13,7 +13,7 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import (
@@ -27,7 +27,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_qwen2 import Qwen2Config


@@ -103,7 +103,7 @@ class Qwen2RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -185,6 +185,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class Qwen2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -202,7 +203,6 @@ class Qwen2Attention(nn.Module):
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
         self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
         self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
-        self.rotary_fn = apply_rotary_pos_emb
         self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None

     def forward(
@@ -14,10 +14,11 @@
 # limitations under the License.
 """Tokenization classes for Qwen2."""

+from typing import Optional, Union
+
 from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers
 from tokenizers.models import BPE

-from ...tokenization_utils_base import generate_merges
 from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging

@@ -38,33 +39,30 @@ PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p
 class Qwen2Tokenizer(TokenizersBackend):
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
-
+    model = BPE

     def __init__(
         self,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
+        merges: Optional[Union[str, list[str]]] = None,
         vocab_file=None,
         merges_file=None,
-        unk_token="<|endoftext|>",
+        unk_token: str = "<|endoftext|>",
         bos_token=None,
-        eos_token="<|endoftext|>",
-        pad_token="<|endoftext|>",
+        eos_token: str = "<|endoftext|>",
+        pad_token: str = "<|endoftext|>",
         add_prefix_space=None,
-        vocab=None,
-        merges=None,
         **kwargs,
     ):
         self.add_prefix_space = add_prefix_space if add_prefix_space is not None else False
-
-
-
-
-        )
-        else:
-            self._vocab = {
+        self._vocab = (
+            vocab
+            if vocab is not None
+            else {
                 "<|endoftext|>": 0,
             }
-
-
+        )
+        self._merges = merges or []
         self._tokenizer = Tokenizer(
             BPE(
                 vocab=self._vocab,
@@ -92,12 +90,10 @@ class Qwen2Tokenizer(TokenizersBackend):
                 ),
             ]
         )
-        tokenizer_object = self._tokenizer

         super().__init__(
             vocab_file=vocab_file,
             merges_file=merges_file,
-            tokenizer_object=tokenizer_object,
             unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
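Per the new `Qwen2Tokenizer.__init__` signature above, `vocab` and `merges` are now leading arguments that can be passed in memory instead of through files. A usage sketch based only on the signature shown in these hunks (not verified against the released wheel):

# Usage sketch derived from the signature in the hunk above; unverified against rc1.
from transformers import Qwen2Tokenizer

tokenizer = Qwen2Tokenizer(
    vocab={"<|endoftext|>": 0},  # in-memory vocab dict (the default shown in the diff)
    merges=[],                   # BPE merge rules as a list of "a b" strings
)
print(tokenizer.eos_token)       # "<|endoftext|>"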
@@ -365,7 +365,7 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig):
         self.rope_parameters = rope_parameters
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
-            ignore_keys_at_rope_validation={"
+            ignore_keys_at_rope_validation={"mrope_section"},
             **kwargs,
         )

@@ -713,7 +713,9 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig):
         layer_type_validation(self.layer_types, self.num_hidden_layers)

         self.rope_parameters = rope_parameters
-        super().__init__(
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs
+        )


 class Qwen2_5OmniDiTConfig(PreTrainedConfig):