transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +30 -3
- transformers/cli/serve.py +47 -17
- transformers/conversion_mapping.py +15 -2
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +196 -135
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +1 -2
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +1 -2
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/configuration_utils.py +3 -2
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/continuous_api.py +134 -79
- transformers/image_processing_base.py +1 -2
- transformers/integrations/__init__.py +4 -2
- transformers/integrations/accelerate.py +15 -3
- transformers/integrations/aqlm.py +38 -66
- transformers/integrations/awq.py +48 -514
- transformers/integrations/bitnet.py +45 -100
- transformers/integrations/bitsandbytes.py +79 -191
- transformers/integrations/deepspeed.py +1 -0
- transformers/integrations/eetq.py +84 -79
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +236 -193
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +40 -62
- transformers/integrations/hub_kernels.py +42 -3
- transformers/integrations/integration_utils.py +10 -0
- transformers/integrations/mxfp4.py +25 -65
- transformers/integrations/peft.py +7 -29
- transformers/integrations/quanto.py +73 -55
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +44 -90
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +42 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +8 -0
- transformers/modeling_rope_utils.py +30 -6
- transformers/modeling_utils.py +116 -112
- transformers/models/__init__.py +3 -0
- transformers/models/afmoe/modeling_afmoe.py +4 -4
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +2 -0
- transformers/models/altclip/modeling_altclip.py +4 -0
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/modeling_aria.py +4 -4
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/auto/configuration_auto.py +11 -0
- transformers/models/auto/feature_extraction_auto.py +2 -0
- transformers/models/auto/image_processing_auto.py +1 -0
- transformers/models/auto/modeling_auto.py +6 -0
- transformers/models/auto/processing_auto.py +18 -10
- transformers/models/auto/tokenization_auto.py +74 -472
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/bamba/modeling_bamba.py +4 -3
- transformers/models/bark/modeling_bark.py +2 -0
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/big_bird/modeling_big_bird.py +6 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +11 -2
- transformers/models/bitnet/modeling_bitnet.py +4 -4
- transformers/models/blenderbot/modeling_blenderbot.py +5 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
- transformers/models/blip/modeling_blip_text.py +2 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -1
- transformers/models/bloom/modeling_bloom.py +4 -0
- transformers/models/blt/modeling_blt.py +2 -2
- transformers/models/blt/modular_blt.py +2 -2
- transformers/models/bridgetower/modeling_bridgetower.py +5 -1
- transformers/models/bros/modeling_bros.py +4 -0
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +5 -0
- transformers/models/chameleon/modeling_chameleon.py +2 -1
- transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
- transformers/models/clap/modeling_clap.py +5 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +5 -0
- transformers/models/clvp/modeling_clvp.py +5 -0
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +4 -3
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +7 -6
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
- transformers/models/convbert/modeling_convbert.py +6 -0
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/modeling_csm.py +4 -3
- transformers/models/ctrl/modeling_ctrl.py +1 -0
- transformers/models/cvt/modeling_cvt.py +2 -0
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/modeling_d_fine.py +2 -0
- transformers/models/d_fine/modular_d_fine.py +1 -0
- transformers/models/dab_detr/modeling_dab_detr.py +4 -0
- transformers/models/dac/modeling_dac.py +2 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/dbrx/modeling_dbrx.py +2 -2
- transformers/models/deberta/modeling_deberta.py +5 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
- transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
- transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
- transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/modeling_detr.py +5 -0
- transformers/models/dia/modeling_dia.py +4 -3
- transformers/models/dia/modular_dia.py +0 -1
- transformers/models/diffllama/modeling_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +2 -3
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +2 -0
- transformers/models/dots1/modeling_dots1.py +10 -7
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/edgetam/modeling_edgetam.py +1 -1
- transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
- transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
- transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
- transformers/models/efficientnet/modeling_efficientnet.py +2 -0
- transformers/models/emu3/modeling_emu3.py +4 -4
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +14 -2
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
- transformers/models/esm/modeling_esmfold.py +5 -4
- transformers/models/evolla/modeling_evolla.py +4 -4
- transformers/models/exaone4/modeling_exaone4.py +2 -2
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +6 -1
- transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
- transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
- transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
- transformers/models/flaubert/modeling_flaubert.py +7 -0
- transformers/models/flava/modeling_flava.py +6 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
- transformers/models/florence2/modeling_florence2.py +2 -1
- transformers/models/florence2/modular_florence2.py +2 -1
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/processing_fuyu.py +3 -3
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +2 -1
- transformers/models/gemma3/modeling_gemma3.py +14 -84
- transformers/models/gemma3/modular_gemma3.py +12 -81
- transformers/models/gemma3n/modeling_gemma3n.py +18 -209
- transformers/models/gemma3n/modular_gemma3n.py +17 -59
- transformers/models/git/modeling_git.py +2 -0
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/modeling_glm4v.py +3 -3
- transformers/models/glm4v/modular_glm4v.py +6 -4
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
- transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/gpt2/modeling_gpt2.py +5 -1
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
- transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
- transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
- transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
- transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
- transformers/models/gptj/modeling_gptj.py +3 -0
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granitemoe/modeling_granitemoe.py +4 -6
- transformers/models/granitemoe/modular_granitemoe.py +0 -2
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
- transformers/models/groupvit/modeling_groupvit.py +3 -0
- transformers/models/helium/modeling_helium.py +4 -3
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +3 -0
- transformers/models/hubert/modular_hubert.py +1 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
- transformers/models/ibert/modeling_ibert.py +6 -0
- transformers/models/idefics/modeling_idefics.py +5 -21
- transformers/models/imagegpt/modeling_imagegpt.py +2 -1
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/internvl/modeling_internvl.py +2 -4
- transformers/models/internvl/modular_internvl.py +2 -4
- transformers/models/jamba/modeling_jamba.py +2 -2
- transformers/models/janus/modeling_janus.py +1 -0
- transformers/models/janus/modular_janus.py +1 -0
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/kosmos2/modeling_kosmos2.py +1 -0
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +244 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +729 -0
- transformers/models/lasr/modular_lasr.py +569 -0
- transformers/models/lasr/processing_lasr.py +96 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +5 -0
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +3 -0
- transformers/models/lfm2/modeling_lfm2.py +4 -5
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +4 -0
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/modeling_llama4.py +3 -2
- transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
- transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -0
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +4 -0
- transformers/models/mamba/modeling_mamba.py +14 -22
- transformers/models/marian/modeling_marian.py +5 -0
- transformers/models/markuplm/modeling_markuplm.py +4 -0
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/modeling_mask2former.py +2 -0
- transformers/models/maskformer/modeling_maskformer.py +2 -0
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +3 -1
- transformers/models/minimax/modeling_minimax.py +4 -4
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +4 -3
- transformers/models/mistral/modeling_mistral.py +4 -3
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mllama/modeling_mllama.py +2 -2
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/modeling_mobilevit.py +3 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
- transformers/models/modernbert/modeling_modernbert.py +4 -1
- transformers/models/modernbert/modular_modernbert.py +2 -0
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
- transformers/models/moonshine/modeling_moonshine.py +4 -2
- transformers/models/moshi/modeling_moshi.py +5 -2
- transformers/models/mpnet/modeling_mpnet.py +5 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +6 -0
- transformers/models/mt5/modeling_mt5.py +7 -0
- transformers/models/musicgen/modeling_musicgen.py +2 -0
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nemotron/modeling_nemotron.py +4 -2
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nougat/tokenization_nougat.py +11 -59
- transformers/models/nystromformer/modeling_nystromformer.py +6 -0
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +4 -5
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
- transformers/models/oneformer/modeling_oneformer.py +4 -1
- transformers/models/openai/modeling_openai.py +3 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/owlv2/modeling_owlv2.py +4 -0
- transformers/models/owlvit/modeling_owlvit.py +4 -0
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +9 -6
- transformers/models/parakeet/modular_parakeet.py +2 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
- transformers/models/patchtst/modeling_patchtst.py +20 -2
- transformers/models/pegasus/modeling_pegasus.py +5 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
- transformers/models/perceiver/modeling_perceiver.py +8 -0
- transformers/models/persimmon/modeling_persimmon.py +2 -1
- transformers/models/phi/modeling_phi.py +4 -5
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +2 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
- transformers/models/phimoe/modeling_phimoe.py +4 -4
- transformers/models/phimoe/modular_phimoe.py +2 -2
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pixtral/modeling_pixtral.py +2 -1
- transformers/models/plbart/modeling_plbart.py +6 -0
- transformers/models/plbart/modular_plbart.py +2 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/modeling_poolformer.py +2 -0
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +3 -0
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
- transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
- transformers/models/rag/modeling_rag.py +1 -0
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
- transformers/models/reformer/modeling_reformer.py +4 -0
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +6 -1
- transformers/models/rembert/modeling_rembert.py +6 -0
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +11 -2
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/modeling_rt_detr.py +2 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
- transformers/models/rwkv/modeling_rwkv.py +1 -0
- transformers/models/sam2/modeling_sam2.py +2 -2
- transformers/models/sam2/modular_sam2.py +2 -2
- transformers/models/sam2_video/modeling_sam2_video.py +1 -0
- transformers/models/sam2_video/modular_sam2_video.py +1 -0
- transformers/models/sam3/modeling_sam3.py +77 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
- transformers/models/sam3_video/modeling_sam3_video.py +1 -0
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
- transformers/models/seed_oss/modeling_seed_oss.py +2 -2
- transformers/models/segformer/modeling_segformer.py +4 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/siglip2/modeling_siglip2.py +4 -0
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
- transformers/models/speecht5/modeling_speecht5.py +13 -1
- transformers/models/splinter/modeling_splinter.py +3 -0
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +6 -0
- transformers/models/stablelm/modeling_stablelm.py +3 -1
- transformers/models/starcoder2/modeling_starcoder2.py +4 -3
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +2 -0
- transformers/models/swin/modeling_swin.py +4 -0
- transformers/models/swin2sr/modeling_swin2sr.py +2 -0
- transformers/models/swinv2/modeling_swinv2.py +4 -0
- transformers/models/t5/modeling_t5.py +7 -0
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +5 -5
- transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
- transformers/models/table_transformer/modeling_table_transformer.py +4 -0
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +2 -0
- transformers/models/timesfm/modular_timesfm.py +2 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
- transformers/models/trocr/modeling_trocr.py +2 -0
- transformers/models/tvp/modeling_tvp.py +2 -0
- transformers/models/udop/modeling_udop.py +4 -0
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/modeling_umt5.py +7 -0
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/vilt/modeling_vilt.py +6 -0
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +6 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/modeling_vitmatte.py +1 -0
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/modeling_whisper.py +6 -0
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +3 -0
- transformers/models/xglm/modeling_xglm.py +1 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +5 -0
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/yoso/modeling_yoso.py +6 -0
- transformers/models/zamba/modeling_zamba.py +2 -0
- transformers/models/zamba2/modeling_zamba2.py +4 -2
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/modeling_zoedepth.py +1 -0
- transformers/pipelines/__init__.py +2 -3
- transformers/pipelines/base.py +1 -9
- transformers/pipelines/document_question_answering.py +3 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/processing_utils.py +23 -11
- transformers/quantizers/base.py +35 -110
- transformers/quantizers/quantizer_aqlm.py +1 -5
- transformers/quantizers/quantizer_auto_round.py +1 -2
- transformers/quantizers/quantizer_awq.py +17 -81
- transformers/quantizers/quantizer_bitnet.py +3 -8
- transformers/quantizers/quantizer_bnb_4bit.py +13 -110
- transformers/quantizers/quantizer_bnb_8bit.py +16 -92
- transformers/quantizers/quantizer_compressed_tensors.py +1 -5
- transformers/quantizers/quantizer_eetq.py +14 -62
- transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
- transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
- transformers/quantizers/quantizer_fp_quant.py +48 -78
- transformers/quantizers/quantizer_gptq.py +7 -24
- transformers/quantizers/quantizer_higgs.py +40 -54
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +13 -167
- transformers/quantizers/quantizer_quanto.py +20 -64
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +1 -4
- transformers/quantizers/quantizer_torchao.py +23 -202
- transformers/quantizers/quantizer_vptq.py +8 -22
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +297 -36
- transformers/tokenization_mistral_common.py +4 -0
- transformers/tokenization_utils_base.py +113 -222
- transformers/tokenization_utils_tokenizers.py +168 -107
- transformers/trainer.py +28 -31
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +66 -28
- transformers/utils/__init__.py +3 -4
- transformers/utils/auto_docstring.py +1 -0
- transformers/utils/generic.py +27 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +61 -16
- transformers/utils/kernel_config.py +4 -2
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +75 -242
- transformers/video_processing_utils.py +1 -2
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
transformers/models/nllb/tokenization_nllb.py

```diff
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import Optional, Union
 
 from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
 from tokenizers.models import BPE
@@ -83,13 +83,15 @@ class NllbTokenizer(TokenizersBackend):
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
-
+    model = BPE
 
     prefix_tokens: list[int] = []
     suffix_tokens: list[int] = []
 
     def __init__(
         self,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
+        merges: Optional[Union[str, list[str]]] = None,
         bos_token="<s>",
         eos_token="</s>",
         sep_token="</s>",
@@ -101,16 +103,11 @@ class NllbTokenizer(TokenizersBackend):
         tgt_lang=None,
         additional_special_tokens=None,
         legacy_behaviour=False,
-        vocab=None,
-        merges=None,
-        vocab_file=None,
         **kwargs,
     ):
         if additional_special_tokens is None:
             additional_special_tokens = kwargs.get("extra_special_tokens", FAIRSEQ_LANGUAGE_CODES)
 
-        self.vocab_file = vocab_file
-
         mask_token = (
             AddedToken(mask_token, normalized=True, lstrip=True, special=True)
             if isinstance(mask_token, str)
@@ -118,23 +115,15 @@ class NllbTokenizer(TokenizersBackend):
         )
         self.legacy_behaviour = legacy_behaviour
 
-        if vocab is not None:
-            if isinstance(vocab, list):
-                self._vocab = {token: idx for idx, (token, _score) in enumerate(vocab)}
-            else:
-                self._vocab = vocab
-        else:
-            self._vocab = {
+        if vocab is None:
+            vocab = {
                 str(bos_token): 0,
                 str(pad_token): 1,
                 str(eos_token): 2,
                 str(unk_token): 3,
             }
-
-        if merges is None:
-            self._merges = []
-        else:
-            self._merges = merges
+        self._vocab = vocab
+        self._merges = merges or []
 
         self._tokenizer = Tokenizer(
             BPE(
@@ -158,13 +147,10 @@ class NllbTokenizer(TokenizersBackend):
         self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True)
         self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)
 
-        tokenizer_object = self._tokenizer
-
         # Remove extra_special_tokens from kwargs if present to avoid conflict
         kwargs.pop("extra_special_tokens", None)
 
         super().__init__(
-            tokenizer_object=tokenizer_object,
             bos_token=bos_token,
             eos_token=eos_token,
             sep_token=sep_token,
```
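The NLLB hunks above drop the old `vocab_file` plumbing: `vocab` and `merges` become regular constructor arguments that accept either a path or an in-memory object, with a four-token fallback vocabulary when nothing is passed. Below is a minimal usage sketch based only on the new signature shown in the diff; the vocabulary values are toy placeholders, not a real NLLB vocabulary.

```python
from transformers import NllbTokenizer

# In-memory vocab/merges; a file path is also accepted per the new
# Optional[Union[str, dict[str, int]]] / Optional[Union[str, list[str]]] hints.
toy_vocab = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3, "▁hello": 4}
toy_merges = []  # BPE merge rules; an empty list is stored as-is (merges or [])

tokenizer = NllbTokenizer(vocab=toy_vocab, merges=toy_merges)

# With no vocab at all, the constructor now builds the 4-entry default
# {bos, pad, eos, unk} vocabulary shown in the diff instead of reading vocab_file.
blank_tokenizer = NllbTokenizer()
```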
transformers/models/nougat/tokenization_nougat.py

```diff
@@ -380,16 +380,16 @@ class NougatTokenizer(TokenizersBackend):
         pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
 
-        vocab (`dict`, *optional*):
+        vocab (`str`, `dict` or `list`, *optional*):
             Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
 
-        merges (`list`, *optional*):
+        merges (`str` or `list`, *optional*):
             Custom merges list. If not provided, merges are loaded from merges_file.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
-
+    model = BPE
 
     def __init__(
         self,
@@ -398,28 +398,22 @@ class NougatTokenizer(TokenizersBackend):
         bos_token: str = "<s>",
         eos_token: str = "</s>",
         pad_token: str = "<pad>",
-        vocab: Optional[dict] = None,
-        merges: Optional[list] = None,
+        vocab: Optional[Union[str, dict, list]] = None,
+        merges: Optional[Union[str, list]] = None,
         **kwargs,
     ):
-
-
-
-
-        else:
-            self._vocab = {
+        self._vocab = (
+            vocab
+            if vocab is not None
+            else {
                 str(bos_token): 0,
                 str(pad_token): 1,
                 str(eos_token): 2,
                 str(unk_token): 3,
                 "[START_REF]": 4,
             }
-
-        if merges is not None:
-            self._merges = merges
-        else:
-            self._merges = []
-
+        )
+        self._merges = merges or []
         self._tokenizer = Tokenizer(
             BPE(
                 vocab=self._vocab,
@@ -464,10 +458,7 @@ class NougatTokenizer(TokenizersBackend):
         self._tokenizer.enable_truncation(max_length=4096)
         self._tokenizer.enable_padding(length=4096, pad_id=pad_token_id, pad_token=str(pad_token))
 
-        tokenizer_object = self._tokenizer
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             errors=errors,
             unk_token=unk_token,
             bos_token=bos_token,
@@ -476,45 +467,6 @@ class NougatTokenizer(TokenizersBackend):
             **kwargs,
         )
 
-    def _post_init(self):
-        """Post-initialization to ensure tokenizer settings are applied correctly."""
-        # Re-apply settings to ensure they're correct after loading from pretrained
-        self._tokenizer.normalizer = normalizers.NFKC()
-        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
-            [
-                pre_tokenizers.Split(pattern="SPL1T-TH1S-Pl3A5E", behavior="removed", invert=False),
-                pre_tokenizers.Digits(individual_digits=True),
-                pre_tokenizers.Split(
-                    pattern=r"[\(\)\[\]\{\}]|([!\"#\$%\&'\*\+,\-\./:;<=>\?\\\^_`\|\~])\1*",
-                    behavior="isolated",
-                    invert=False,
-                ),
-                pre_tokenizers.Split(pattern="\n", behavior="isolated", invert=False),
-                pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True),
-            ]
-        )
-        self._tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)
-
-        # Set up post processor with bos and eos tokens
-        bos_token_id = self.bos_token_id if self.bos_token_id is not None else 0
-        eos_token_id = self.eos_token_id if self.eos_token_id is not None else 2
-        pad_token_id = self.pad_token_id if self.pad_token_id is not None else 1
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single=f"{self.bos_token}:0 $A:0 {self.eos_token}:0",
-            pair="$A:0 $B:1",
-            special_tokens=[
-                (str(self.eos_token), eos_token_id),
-                (str(self.bos_token), bos_token_id),
-            ],
-        )
-
-        # Enable truncation and padding
-        self._tokenizer.enable_truncation(max_length=4096)
-        self._tokenizer.enable_padding(length=4096, pad_id=pad_token_id, pad_token=str(self.pad_token))
-
-        # Call parent to handle AddedToken properties
-        super()._post_init()
-
     def remove_hallucinated_references(self, text: str) -> str:
         """
         Remove hallucinated or missing references from the text.
```
transformers/models/nystromformer/modeling_nystromformer.py

```diff
@@ -443,6 +443,7 @@ class NystromformerModel(NystromformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -539,6 +540,7 @@ class NystromformerForMaskedLM(NystromformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -628,6 +630,7 @@ class NystromformerForSequenceClassification(NystromformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -709,6 +712,7 @@ class NystromformerForMultipleChoice(NystromformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -814,6 +818,7 @@ class NystromformerForTokenClassification(NystromformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -881,6 +886,7 @@ class NystromformerForQuestionAnswering(NystromformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
```
transformers/models/olmo/modeling_olmo.py

```diff
@@ -34,6 +34,7 @@ import torch.nn.functional as F
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
+from ...integrations import use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -41,7 +42,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_olmo import OlmoConfig
 
 
@@ -131,7 +132,7 @@ class OlmoRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -212,6 +213,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     return q_embed.to(q_type), k_embed.to(k_type)
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class OlmoAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -237,7 +239,6 @@ class OlmoAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
 
     def forward(
         self,
@@ -246,7 +247,6 @@ class OlmoAttention(nn.Module):
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
```
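The same one-line change repeats across the rotary-embedding code in this release: `torch.autocast(...)` call sites become `maybe_autocast(...)`, imported from `...utils.generic`. The helper's implementation is not part of this diff; the following is only a hedged sketch of the apparent intent (keep autocast semantics where the backend supports them, degrade to a no-op elsewhere), under the assumption that the real helper behaves similarly.

```python
import contextlib

import torch


def maybe_autocast_sketch(device_type: str, enabled: bool = True, **kwargs):
    """Hypothetical stand-in for transformers' maybe_autocast: return a
    torch.autocast context when the device type is supported, otherwise a
    null context so callers never crash on unsupported backends."""
    try:
        return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
    except RuntimeError:  # e.g. an unsupported device_type
        return contextlib.nullcontext()


# Call sites in the hunks use it exactly like torch.autocast:
#   with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
#       freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
```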
transformers/models/olmo/modular_olmo.py

```diff
@@ -29,6 +29,7 @@ from ...cache_utils import Cache
 from ...modeling_rope_utils import dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...utils import logging
+from ...utils.generic import maybe_autocast
 from ..llama.modeling_llama import (
     LlamaAttention,
     LlamaDecoderLayer,
@@ -77,7 +78,7 @@ class OlmoRotaryEmbedding(LlamaRotaryEmbedding):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -121,7 +122,6 @@ class OlmoAttention(LlamaAttention):
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
```
transformers/models/olmo2/modeling_olmo2.py

```diff
@@ -35,7 +35,7 @@ from transformers.utils.generic import TransformersKwargs
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -43,7 +43,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_olmo2 import Olmo2Config
 
 
@@ -124,7 +124,7 @@ class Olmo2RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -205,6 +205,7 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class Olmo2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -230,7 +231,6 @@ class Olmo2Attention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         self.q_norm = Olmo2RMSNorm(config.num_attention_heads * self.head_dim, config.rms_norm_eps)
         self.k_norm = Olmo2RMSNorm(config.num_key_value_heads * self.head_dim, config.rms_norm_eps)
 
@@ -241,7 +241,6 @@ class Olmo2Attention(nn.Module):
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
```
transformers/models/olmo2/modular_olmo2.py

```diff
@@ -219,7 +219,6 @@ class Olmo2Attention(OlmoAttention):
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
```
transformers/models/olmo3/modeling_olmo3.py

```diff
@@ -30,7 +30,7 @@ from transformers.utils.generic import TransformersKwargs
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -38,7 +38,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_olmo3 import Olmo3Config
 
 
@@ -136,6 +136,7 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class Olmo3Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -161,7 +162,6 @@ class Olmo3Attention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         self.q_norm = Olmo3RMSNorm(config.num_attention_heads * self.head_dim, config.rms_norm_eps)
         self.k_norm = Olmo3RMSNorm(config.num_key_value_heads * self.head_dim, config.rms_norm_eps)
         assert config.layer_types is not None
@@ -332,7 +332,7 @@ class Olmo3RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
```
transformers/models/olmoe/modeling_olmoe.py

```diff
@@ -27,7 +27,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
@@ -35,7 +35,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import OutputRecorder, check_model_inputs
+from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from .configuration_olmoe import OlmoeConfig
 
 
@@ -116,7 +116,7 @@ class OlmoeRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -214,6 +214,7 @@ def eager_attention_forward(
     return attn_output, attn_weights
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class OlmoeAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -239,7 +240,6 @@ class OlmoeAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         self.q_norm = OlmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.k_norm = OlmoeRMSNorm(
             (config.hidden_size // config.num_attention_heads) * config.num_key_value_heads, eps=config.rms_norm_eps
```
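In the OLMo, OLMo2, OLMo3 and OLMoE attention classes above, the new `@use_kernelized_func(apply_rotary_pos_emb)` class decorator replaces the per-instance `self.rotary_fn = apply_rotary_pos_emb` assignment. Its body lives in `transformers/integrations` (see `hub_kernels.py`, +42 -3, in the file list) and is not shown in this diff; the following is only a hedged guess at the shape of such a decorator, using hypothetical names.

```python
from typing import Callable


def use_kernelized_func_sketch(func: Callable):
    """Hypothetical class decorator: expose `func` (or an optimized drop-in
    kernel, if one can be resolved) as `cls.rotary_fn`, so attention modules
    no longer need to assign it in __init__. Not the transformers code."""

    def decorator(cls):
        kernel = None  # a real implementation might try to load a hub kernel here
        cls.rotary_fn = staticmethod(kernel if kernel is not None else func)
        return cls

    return decorator


@use_kernelized_func_sketch(lambda q, k, cos, sin: (q, k))  # toy rotary function
class ToyAttention:
    pass


assert ToyAttention.rotary_fn("q", "k", None, None) == ("q", "k")
```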
transformers/models/omdet_turbo/modeling_omdet_turbo.py

```diff
@@ -1316,6 +1316,7 @@ class OmDetTurboDecoder(OmDetTurboPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         """
         Args:
@@ -1505,6 +1506,7 @@ class OmDetTurboForObjectDetection(OmDetTurboPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], OmDetTurboObjectDetectionOutput]:
         r"""
         classes_input_ids (`torch.LongTensor` of shape `(total_classes (>= batch_size), sequence_length)`):
```
transformers/models/oneformer/modeling_oneformer.py

```diff
@@ -39,6 +39,7 @@ from ...utils import (
     requires_backends,
 )
 from ...utils.backbone_utils import load_backbone
+from ...utils.generic import maybe_autocast
 from .configuration_oneformer import OneFormerConfig
 
 
@@ -322,7 +323,7 @@ class OneFormerHungarianMatcher(nn.Module):
                 align_corners=False,
             ).squeeze(1)
 
-            with torch.autocast(device_type="cuda", enabled=False):
+            with maybe_autocast(device_type="cuda", enabled=False):
                 pred_mask = pred_mask.float()
                 target_mask = target_mask.float()
 
@@ -2872,6 +2873,7 @@ class OneFormerModel(OneFormerPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> OneFormerModelOutput:
         r"""
         task_inputs (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -3058,6 +3060,7 @@ class OneFormerForUniversalSegmentation(OneFormerPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> OneFormerForUniversalSegmentationOutput:
         r"""
         task_inputs (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
```
transformers/models/openai/modeling_openai.py

```diff
@@ -317,6 +317,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -514,6 +515,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], OpenAIGPTDoubleHeadsModelOutput]:
         r"""
         mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):
@@ -624,6 +626,7 @@ class OpenAIGPTForSequenceClassification(OpenAIGPTPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
```
transformers/models/openai/tokenization_openai.py

```diff
@@ -14,10 +14,11 @@
 # limitations under the License.
 """Tokenization classes for OpenAI GPT."""
 
+from typing import Optional, Union
+
 from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
 from tokenizers.models import BPE
 
-from ...convert_slow_tokenizer import generate_merges
 from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
 
@@ -48,40 +49,26 @@ class OpenAIGPTTokenizer(TokenizersBackend):
         unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        vocab (`dict`, *optional*):
+        vocab (`str` or `dict[str, int]`, *optional*):
             Custom vocabulary dictionary. If not provided, a blank vocabulary is initialized.
-        merges (`list`, *optional*):
+        merges (`str` or `list[str]`, *optional*):
             Custom merges list. If not provided, an empty list is used.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    model = BPE
 
     def __init__(
         self,
-
-
-
-        vocab_file=None,
-        merges_file=None,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
+        merges: Optional[Union[str, list[str]]] = None,
+        unk_token: str = "<unk>",
         **kwargs,
     ):
-
-
-            self._vocab = (
-                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
-            )
-        else:
-            # Initialize minimal vocabulary with unk token
-            self._vocab = {str(unk_token): 0}
-
-        # Initialize merges
-        if merges is not None:
-            self._merges = merges if merges is not None else generate_merges(self._vocab)
-        else:
-            self._merges = []
+        self._vocab = vocab if vocab is not None else {str(unk_token): 0}
+        self._merges = merges or []
 
-        # Create BPE tokenizer
         self._tokenizer = Tokenizer(
             BPE(
                 vocab=self._vocab,
@@ -107,34 +94,11 @@ class OpenAIGPTTokenizer(TokenizersBackend):
         self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
         self._tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")
 
-        tokenizer_object = self._tokenizer
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             unk_token=unk_token,
             **kwargs,
         )
 
-        self.vocab_file = vocab_file
-        self.merges_file = merges_file
-
-    def _post_init(self):
-        """Post-initialization to ensure tokenizer settings are applied correctly."""
-        # Re-apply settings to ensure they're correct after loading from pretrained
-        self._tokenizer.normalizer = normalizers.Sequence(
-            [
-                normalizers.NFD(),
-                normalizers.Lowercase(),
-                normalizers.StripAccents(),
-            ]
-        )
-
-        self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
-        self._tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")
-
-        # Call parent to handle AddedToken properties
-        super()._post_init()
-
     @property
     def do_lower_case(self):
         return True
```
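As with the NLLB and Nougat tokenizers earlier in the diff, `OpenAIGPTTokenizer` now takes `vocab`/`merges` directly and no longer tracks `vocab_file`/`merges_file` or re-applies its settings in `_post_init`. A short sketch of the two construction paths implied by the new `__init__` (toy values only):

```python
from transformers import OpenAIGPTTokenizer

# No vocab given: falls back to the minimal {unk_token: 0} vocabulary and an
# empty merges list (merges or []), as shown in the diff.
blank = OpenAIGPTTokenizer()

# Explicit in-memory vocab/merges; "</w>" is the BPE end-of-word suffix this
# tokenizer's decoder uses (decoders.BPEDecoder(suffix="</w>")).
tok = OpenAIGPTTokenizer(vocab={"<unk>": 0, "hello</w>": 1}, merges=[], unk_token="<unk>")
print(tok.convert_tokens_to_ids("hello</w>"))  # -> 1
```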
transformers/models/opt/modeling_opt.py

```diff
@@ -836,6 +836,7 @@ class OPTForSequenceClassification(OPTPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         position_ids: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -947,6 +948,7 @@ class OPTForQuestionAnswering(OPTPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         position_ids: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         r"""
         Example:
```