transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc1-py3-none-any.whl
This diff compares the contents of two publicly released package versions from one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- transformers/__init__.py +30 -3
- transformers/cli/serve.py +47 -17
- transformers/conversion_mapping.py +15 -2
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +196 -135
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +1 -2
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +1 -2
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/configuration_utils.py +3 -2
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/continuous_api.py +134 -79
- transformers/image_processing_base.py +1 -2
- transformers/integrations/__init__.py +4 -2
- transformers/integrations/accelerate.py +15 -3
- transformers/integrations/aqlm.py +38 -66
- transformers/integrations/awq.py +48 -514
- transformers/integrations/bitnet.py +45 -100
- transformers/integrations/bitsandbytes.py +79 -191
- transformers/integrations/deepspeed.py +1 -0
- transformers/integrations/eetq.py +84 -79
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +236 -193
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +40 -62
- transformers/integrations/hub_kernels.py +42 -3
- transformers/integrations/integration_utils.py +10 -0
- transformers/integrations/mxfp4.py +25 -65
- transformers/integrations/peft.py +7 -29
- transformers/integrations/quanto.py +73 -55
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +44 -90
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +42 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +8 -0
- transformers/modeling_rope_utils.py +30 -6
- transformers/modeling_utils.py +116 -112
- transformers/models/__init__.py +3 -0
- transformers/models/afmoe/modeling_afmoe.py +4 -4
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +2 -0
- transformers/models/altclip/modeling_altclip.py +4 -0
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/modeling_aria.py +4 -4
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/auto/configuration_auto.py +11 -0
- transformers/models/auto/feature_extraction_auto.py +2 -0
- transformers/models/auto/image_processing_auto.py +1 -0
- transformers/models/auto/modeling_auto.py +6 -0
- transformers/models/auto/processing_auto.py +18 -10
- transformers/models/auto/tokenization_auto.py +74 -472
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/bamba/modeling_bamba.py +4 -3
- transformers/models/bark/modeling_bark.py +2 -0
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/big_bird/modeling_big_bird.py +6 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +11 -2
- transformers/models/bitnet/modeling_bitnet.py +4 -4
- transformers/models/blenderbot/modeling_blenderbot.py +5 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
- transformers/models/blip/modeling_blip_text.py +2 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -1
- transformers/models/bloom/modeling_bloom.py +4 -0
- transformers/models/blt/modeling_blt.py +2 -2
- transformers/models/blt/modular_blt.py +2 -2
- transformers/models/bridgetower/modeling_bridgetower.py +5 -1
- transformers/models/bros/modeling_bros.py +4 -0
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +5 -0
- transformers/models/chameleon/modeling_chameleon.py +2 -1
- transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
- transformers/models/clap/modeling_clap.py +5 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +5 -0
- transformers/models/clvp/modeling_clvp.py +5 -0
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +4 -3
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +7 -6
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
- transformers/models/convbert/modeling_convbert.py +6 -0
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/modeling_csm.py +4 -3
- transformers/models/ctrl/modeling_ctrl.py +1 -0
- transformers/models/cvt/modeling_cvt.py +2 -0
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/modeling_d_fine.py +2 -0
- transformers/models/d_fine/modular_d_fine.py +1 -0
- transformers/models/dab_detr/modeling_dab_detr.py +4 -0
- transformers/models/dac/modeling_dac.py +2 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/dbrx/modeling_dbrx.py +2 -2
- transformers/models/deberta/modeling_deberta.py +5 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
- transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
- transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
- transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/modeling_detr.py +5 -0
- transformers/models/dia/modeling_dia.py +4 -3
- transformers/models/dia/modular_dia.py +0 -1
- transformers/models/diffllama/modeling_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +2 -3
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +2 -0
- transformers/models/dots1/modeling_dots1.py +10 -7
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/edgetam/modeling_edgetam.py +1 -1
- transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
- transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
- transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
- transformers/models/efficientnet/modeling_efficientnet.py +2 -0
- transformers/models/emu3/modeling_emu3.py +4 -4
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +14 -2
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
- transformers/models/esm/modeling_esmfold.py +5 -4
- transformers/models/evolla/modeling_evolla.py +4 -4
- transformers/models/exaone4/modeling_exaone4.py +2 -2
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +6 -1
- transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
- transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
- transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
- transformers/models/flaubert/modeling_flaubert.py +7 -0
- transformers/models/flava/modeling_flava.py +6 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
- transformers/models/florence2/modeling_florence2.py +2 -1
- transformers/models/florence2/modular_florence2.py +2 -1
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/processing_fuyu.py +3 -3
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +2 -1
- transformers/models/gemma3/modeling_gemma3.py +14 -84
- transformers/models/gemma3/modular_gemma3.py +12 -81
- transformers/models/gemma3n/modeling_gemma3n.py +18 -209
- transformers/models/gemma3n/modular_gemma3n.py +17 -59
- transformers/models/git/modeling_git.py +2 -0
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/modeling_glm4v.py +3 -3
- transformers/models/glm4v/modular_glm4v.py +6 -4
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
- transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/gpt2/modeling_gpt2.py +5 -1
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
- transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
- transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
- transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
- transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
- transformers/models/gptj/modeling_gptj.py +3 -0
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granitemoe/modeling_granitemoe.py +4 -6
- transformers/models/granitemoe/modular_granitemoe.py +0 -2
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
- transformers/models/groupvit/modeling_groupvit.py +3 -0
- transformers/models/helium/modeling_helium.py +4 -3
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +3 -0
- transformers/models/hubert/modular_hubert.py +1 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
- transformers/models/ibert/modeling_ibert.py +6 -0
- transformers/models/idefics/modeling_idefics.py +5 -21
- transformers/models/imagegpt/modeling_imagegpt.py +2 -1
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/internvl/modeling_internvl.py +2 -4
- transformers/models/internvl/modular_internvl.py +2 -4
- transformers/models/jamba/modeling_jamba.py +2 -2
- transformers/models/janus/modeling_janus.py +1 -0
- transformers/models/janus/modular_janus.py +1 -0
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/kosmos2/modeling_kosmos2.py +1 -0
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +244 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +729 -0
- transformers/models/lasr/modular_lasr.py +569 -0
- transformers/models/lasr/processing_lasr.py +96 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +5 -0
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +3 -0
- transformers/models/lfm2/modeling_lfm2.py +4 -5
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +4 -0
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/modeling_llama4.py +3 -2
- transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
- transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -0
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +4 -0
- transformers/models/mamba/modeling_mamba.py +14 -22
- transformers/models/marian/modeling_marian.py +5 -0
- transformers/models/markuplm/modeling_markuplm.py +4 -0
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/modeling_mask2former.py +2 -0
- transformers/models/maskformer/modeling_maskformer.py +2 -0
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +3 -1
- transformers/models/minimax/modeling_minimax.py +4 -4
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +4 -3
- transformers/models/mistral/modeling_mistral.py +4 -3
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mllama/modeling_mllama.py +2 -2
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/modeling_mobilevit.py +3 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
- transformers/models/modernbert/modeling_modernbert.py +4 -1
- transformers/models/modernbert/modular_modernbert.py +2 -0
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
- transformers/models/moonshine/modeling_moonshine.py +4 -2
- transformers/models/moshi/modeling_moshi.py +5 -2
- transformers/models/mpnet/modeling_mpnet.py +5 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +6 -0
- transformers/models/mt5/modeling_mt5.py +7 -0
- transformers/models/musicgen/modeling_musicgen.py +2 -0
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nemotron/modeling_nemotron.py +4 -2
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nougat/tokenization_nougat.py +11 -59
- transformers/models/nystromformer/modeling_nystromformer.py +6 -0
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +4 -5
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
- transformers/models/oneformer/modeling_oneformer.py +4 -1
- transformers/models/openai/modeling_openai.py +3 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/owlv2/modeling_owlv2.py +4 -0
- transformers/models/owlvit/modeling_owlvit.py +4 -0
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +9 -6
- transformers/models/parakeet/modular_parakeet.py +2 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
- transformers/models/patchtst/modeling_patchtst.py +20 -2
- transformers/models/pegasus/modeling_pegasus.py +5 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
- transformers/models/perceiver/modeling_perceiver.py +8 -0
- transformers/models/persimmon/modeling_persimmon.py +2 -1
- transformers/models/phi/modeling_phi.py +4 -5
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +2 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
- transformers/models/phimoe/modeling_phimoe.py +4 -4
- transformers/models/phimoe/modular_phimoe.py +2 -2
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pixtral/modeling_pixtral.py +2 -1
- transformers/models/plbart/modeling_plbart.py +6 -0
- transformers/models/plbart/modular_plbart.py +2 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/modeling_poolformer.py +2 -0
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +3 -0
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
- transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
- transformers/models/rag/modeling_rag.py +1 -0
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
- transformers/models/reformer/modeling_reformer.py +4 -0
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +6 -1
- transformers/models/rembert/modeling_rembert.py +6 -0
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +11 -2
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/modeling_rt_detr.py +2 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
- transformers/models/rwkv/modeling_rwkv.py +1 -0
- transformers/models/sam2/modeling_sam2.py +2 -2
- transformers/models/sam2/modular_sam2.py +2 -2
- transformers/models/sam2_video/modeling_sam2_video.py +1 -0
- transformers/models/sam2_video/modular_sam2_video.py +1 -0
- transformers/models/sam3/modeling_sam3.py +77 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
- transformers/models/sam3_video/modeling_sam3_video.py +1 -0
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
- transformers/models/seed_oss/modeling_seed_oss.py +2 -2
- transformers/models/segformer/modeling_segformer.py +4 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/siglip2/modeling_siglip2.py +4 -0
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
- transformers/models/speecht5/modeling_speecht5.py +13 -1
- transformers/models/splinter/modeling_splinter.py +3 -0
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +6 -0
- transformers/models/stablelm/modeling_stablelm.py +3 -1
- transformers/models/starcoder2/modeling_starcoder2.py +4 -3
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +2 -0
- transformers/models/swin/modeling_swin.py +4 -0
- transformers/models/swin2sr/modeling_swin2sr.py +2 -0
- transformers/models/swinv2/modeling_swinv2.py +4 -0
- transformers/models/t5/modeling_t5.py +7 -0
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +5 -5
- transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
- transformers/models/table_transformer/modeling_table_transformer.py +4 -0
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +2 -0
- transformers/models/timesfm/modular_timesfm.py +2 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
- transformers/models/trocr/modeling_trocr.py +2 -0
- transformers/models/tvp/modeling_tvp.py +2 -0
- transformers/models/udop/modeling_udop.py +4 -0
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/modeling_umt5.py +7 -0
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/vilt/modeling_vilt.py +6 -0
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +6 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/modeling_vitmatte.py +1 -0
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/modeling_whisper.py +6 -0
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +3 -0
- transformers/models/xglm/modeling_xglm.py +1 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +5 -0
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/yoso/modeling_yoso.py +6 -0
- transformers/models/zamba/modeling_zamba.py +2 -0
- transformers/models/zamba2/modeling_zamba2.py +4 -2
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/modeling_zoedepth.py +1 -0
- transformers/pipelines/__init__.py +2 -3
- transformers/pipelines/base.py +1 -9
- transformers/pipelines/document_question_answering.py +3 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/processing_utils.py +23 -11
- transformers/quantizers/base.py +35 -110
- transformers/quantizers/quantizer_aqlm.py +1 -5
- transformers/quantizers/quantizer_auto_round.py +1 -2
- transformers/quantizers/quantizer_awq.py +17 -81
- transformers/quantizers/quantizer_bitnet.py +3 -8
- transformers/quantizers/quantizer_bnb_4bit.py +13 -110
- transformers/quantizers/quantizer_bnb_8bit.py +16 -92
- transformers/quantizers/quantizer_compressed_tensors.py +1 -5
- transformers/quantizers/quantizer_eetq.py +14 -62
- transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
- transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
- transformers/quantizers/quantizer_fp_quant.py +48 -78
- transformers/quantizers/quantizer_gptq.py +7 -24
- transformers/quantizers/quantizer_higgs.py +40 -54
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +13 -167
- transformers/quantizers/quantizer_quanto.py +20 -64
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +1 -4
- transformers/quantizers/quantizer_torchao.py +23 -202
- transformers/quantizers/quantizer_vptq.py +8 -22
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +297 -36
- transformers/tokenization_mistral_common.py +4 -0
- transformers/tokenization_utils_base.py +113 -222
- transformers/tokenization_utils_tokenizers.py +168 -107
- transformers/trainer.py +28 -31
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +66 -28
- transformers/utils/__init__.py +3 -4
- transformers/utils/auto_docstring.py +1 -0
- transformers/utils/generic.py +27 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +61 -16
- transformers/utils/kernel_config.py +4 -2
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +75 -242
- transformers/video_processing_utils.py +1 -2
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
transformers/models/lasr/tokenization_lasr.py (new file)

@@ -0,0 +1,186 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/lasr/modular_lasr.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_lasr.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team and Google LLC. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import itertools
+import re
+from typing import Optional, Union
+
+from tokenizers import Tokenizer, decoders, pre_tokenizers, processors
+from tokenizers.models import Unigram
+
+from ...tokenization_utils_tokenizers import TokenizersBackend
+
+
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
+
+
+class LasrTokenizer(TokenizersBackend):
+    """
+    Construct a LASR tokenizer (backed by HuggingFace's *tokenizers* library). Based on
+    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
+
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`, *optional*):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        extra_ids (`int`, *optional*, defaults to 100):
+            Add a number of extra ids added to the vocabulary for use as sentinels. These tokens are accessible as
+            "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. These tokens can be retrieved by
+            calling get_sentinel_tokens method and token ids can be by calling get_sentinel_token_ids method
+        additional_special_tokens (`list[str]`, *optional*):
+            Additional special tokens used by the tokenizer.
+        vocab (`str`, `dict` or `list`, *optional*):
+            Custom vocabulary dict. If not provided, a minimal vocabulary is created using the special tokens.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    model = Unigram
+
+    def __init__(
+        self,
+        eos_token="</s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        extra_ids=100,
+        additional_special_tokens=None,
+        vocab=None,
+        vocab_file=None,
+        **kwargs,
+    ):
+        self._extra_ids = extra_ids
+
+        # Handle extra_ids and additional_special_tokens
+        if additional_special_tokens is not None:
+            extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
+            if len(extra_tokens) < 1:
+                additional_special_tokens += [f"<extra_id_{i}>" for i in range(extra_ids)]
+            elif extra_ids > 0 and extra_ids != len(extra_tokens):
+                raise ValueError(
+                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
+                    " provided to LasrTokenizer. In this case the additional_special_tokens must include the extra_ids"
+                    " tokens"
+                )
+        else:
+            extra_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
+            additional_special_tokens = extra_tokens
+
+        # LASR vocab structure: <pad>=0, </s>=1, <unk>=2, then regular vocab, then extra_ids in reverse
+        if vocab is not None:
+            self._vocab_scores = vocab
+        else:
+            self._vocab_scores = [
+                (str(pad_token), 0.0),
+                (str(eos_token), 0.0),
+                (str(unk_token), 0.0),
+                ("▁", -2.0),  # Space token
+            ]
+            for i in range(extra_ids - 1, -1, -1):
+                self._vocab_scores.append((f"<extra_id_{i}>", 0.0))
+        self._tokenizer = Tokenizer(
+            Unigram(
+                self._vocab_scores,
+                unk_id=3,
+                byte_fallback=False,
+            )
+        )
+
+        self._tokenizer.normalizer = None
+
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.WhitespaceSplit(),
+                pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True),
+            ]
+        )
+
+        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)
+
+        super().__init__(
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            extra_ids=extra_ids,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=["$A", "</s>"],
+            pair=["$A", "</s>", "$B", "</s>"],
+            special_tokens=[
+                ("</s>", self.eos_token_id),
+            ],
+        )
+
+    def get_sentinel_tokens(self):
+        """Get the list of sentinel tokens (extra_id tokens) from additional_special_tokens."""
+        return list(
+            set(filter(lambda x: bool(re.search(r"<extra_id_\d+>", x)) is not None, self.additional_special_tokens))
+        )
+
+    def get_sentinel_token_ids(self):
+        """Get the token IDs for sentinel tokens."""
+        return [self.convert_tokens_to_ids(token) for token in self.get_sentinel_tokens()]
+
+    def _decode(
+        self,
+        token_ids: Union[int, list[int]],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        group_tokens: bool = True,
+        **kwargs,
+    ) -> str:
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+        if group_tokens:
+            token_ids = [token_group[0] for token_group in itertools.groupby(token_ids)]
+
+        # for CTC we filter out the blank token, which is the pad token
+        token_ids = [token for token in token_ids if token != self.pad_token_id]
+
+        return super()._decode(
+            token_ids=token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+
+__all__ = ["LasrTokenizer"]
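The `_decode` override above is what makes the new tokenizer suitable for CTC-style ASR output: runs of identical ids emitted for consecutive frames are collapsed, and the blank label (mapped to the pad token) is dropped before normal decoding. Below is a minimal, self-contained sketch of that collapse step; the frame ids and the blank id are made up for illustration, not taken from a real LASR checkpoint.

```python
import itertools


def ctc_collapse(token_ids: list[int], blank_id: int) -> list[int]:
    # 1) merge runs of identical ids, as _decode does with itertools.groupby
    collapsed = [key for key, _ in itertools.groupby(token_ids)]
    # 2) drop the blank label, which LasrTokenizer maps to the pad token id
    return [t for t in collapsed if t != blank_id]


frame_ids = [0, 0, 7, 7, 7, 0, 5, 5, 0, 9]  # hypothetical frame-level argmax ids, 0 = blank/<pad>
print(ctc_collapse(frame_ids, blank_id=0))  # [7, 5, 9]
```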
transformers/models/layoutlm/modeling_layoutlm.py

@@ -465,6 +465,7 @@ class LayoutLMModel(LayoutLMPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):

@@ -600,6 +601,7 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MaskedLMOutput]:
         r"""
         bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):

@@ -716,6 +718,7 @@ class LayoutLMForSequenceClassification(LayoutLMPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):

@@ -850,6 +853,7 @@ class LayoutLMForTokenClassification(LayoutLMPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):

@@ -963,6 +967,7 @@ class LayoutLMForQuestionAnswering(LayoutLMPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         r"""
         bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
transformers/models/layoutlmv2/modeling_layoutlmv2.py

@@ -701,6 +701,7 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         bbox (`torch.LongTensor` of shape `((batch_size, sequence_length), 4)`, *optional*):

@@ -858,6 +859,7 @@ class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `batch_size, sequence_length`):

@@ -1061,6 +1063,7 @@ class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `batch_size, sequence_length`):

@@ -1212,6 +1215,7 @@ class LayoutLMv2ForQuestionAnswering(LayoutLMv2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `batch_size, sequence_length`):
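Each of the modeling hunks above makes the same one-line change: the forward signature now ends in `**kwargs`, so extra keyword arguments handed down by pipelines, the Trainer, or attention backends are tolerated instead of raising a TypeError. A toy sketch of the pattern (not the actual LayoutLM code):

```python
from typing import Optional

import torch
from torch import nn


class ToyEncoder(nn.Module):
    """Stand-in module showing the signature pattern only; not the real LayoutLM forward."""

    def __init__(self) -> None:
        super().__init__()
        self.proj = nn.Linear(4, 2)

    def forward(
        self,
        inputs: torch.Tensor,
        return_dict: Optional[bool] = None,
        **kwargs,  # extra keyword args are accepted (and here simply ignored)
    ) -> torch.Tensor:
        return self.proj(inputs)


model = ToyEncoder()
out = model(torch.randn(1, 4), return_dict=True, output_attentions=False)  # no TypeError
print(out.shape)  # torch.Size([1, 2])
```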
transformers/models/layoutlmv2/tokenization_layoutlmv2.py

@@ -159,22 +159,11 @@ class LayoutLMv2Tokenizer(TokenizersBackend):
     """

     vocab_files_names = VOCAB_FILES_NAMES
-
-
-    @staticmethod
-    def _load_vocab_from_file(vocab_file):
-        """Load vocab from a BERT-style vocab file (one token per line)."""
-        vocab = {}
-        with open(vocab_file, "r", encoding="utf-8") as reader:
-            for index, line in enumerate(reader):
-                token = line.rstrip("\n")
-                vocab[token] = index
-        return vocab
+    model = models.WordPiece

     def __init__(
         self,
-        vocab=None,
-        vocab_file=None,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
         do_lower_case=True,
         unk_token="[UNK]",
         sep_token="[SEP]",

@@ -190,21 +179,12 @@ class LayoutLMv2Tokenizer(TokenizersBackend):
         strip_accents=None,
         **kwargs,
     ):
-        self.vocab_file = vocab_file
         self.do_lower_case = do_lower_case

-        # Build vocab for WordPiece
         if vocab is not None:
-
-            _vocab = vocab
-        else:
-            raise ValueError("vocab must be a dict mapping tokens to ids")
-        elif vocab_file is not None:
-            # Load vocab from file (BERT format: one token per line)
-            _vocab = self._load_vocab_from_file(vocab_file)
+            self._vocab = vocab
         else:
-
-            _vocab = {
+            self._vocab = {
                 str(pad_token): 0,
                 str(unk_token): 1,
                 str(cls_token): 2,

@@ -212,10 +192,7 @@ class LayoutLMv2Tokenizer(TokenizersBackend):
                 str(mask_token): 4,
             }

-
-        self._tokenizer = Tokenizer(models.WordPiece(vocab=_vocab, unk_token=str(unk_token)))
-
-        # Set normalizer
+        self._tokenizer = Tokenizer(models.WordPiece(vocab=self._vocab, unk_token=str(unk_token)))
         self._tokenizer.normalizer = normalizers.BertNormalizer(
             clean_text=True,
             handle_chinese_chars=tokenize_chinese_chars,

@@ -223,27 +200,9 @@ class LayoutLMv2Tokenizer(TokenizersBackend):
             lowercase=do_lower_case,
         )

-        # Set pre_tokenizer
         self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
-
-        # Set decoder
         self._tokenizer.decoder = decoders.WordPiece(prefix="##")
-
-        # Set post_processor (will be set after super().__init__ when we have token IDs)
-        # Temporarily set to None, will be configured after parent init
-        self._tokenizer.post_processor = None
-
-        tokenizer_object = self._tokenizer
-
-        # additional properties
-        self.cls_token_box = cls_token_box
-        self.sep_token_box = sep_token_box
-        self.pad_token_box = pad_token_box
-        self.pad_token_label = pad_token_label
-        self.only_label_first_subword = only_label_first_subword
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             do_lower_case=do_lower_case,
             unk_token=unk_token,
             sep_token=sep_token,

@@ -260,6 +219,11 @@ class LayoutLMv2Tokenizer(TokenizersBackend):
             **kwargs,
         )

+        self.cls_token_box = cls_token_box
+        self.sep_token_box = sep_token_box
+        self.pad_token_box = pad_token_box
+        self.pad_token_label = pad_token_label
+
         # Now set post_processor with actual token IDs
         cls = str(self.cls_token)
         sep = str(self.sep_token)

@@ -275,13 +239,6 @@ class LayoutLMv2Tokenizer(TokenizersBackend):
             ],
         )

-        # additional properties
-        self.cls_token_box = cls_token_box
-        self.sep_token_box = sep_token_box
-        self.pad_token_box = pad_token_box
-        self.pad_token_label = pad_token_label
-        self.only_label_first_subword = only_label_first_subword
-
     @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def __call__(
         self,
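The v2 tokenizer hunks above drop the vocab-file loading helpers and build the WordPiece backend straight from an in-memory vocab dict, setting the box/label attributes after `super().__init__()`. Below is a minimal sketch of that construction path using the `tokenizers` library directly; the tiny vocab is illustrative and not a real LayoutLMv2 vocabulary.

```python
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers

# special tokens first, then a couple of toy word pieces
vocab = {"[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3, "[MASK]": 4, "hello": 5, "world": 6}

tok = Tokenizer(models.WordPiece(vocab=vocab, unk_token="[UNK]"))
tok.normalizer = normalizers.BertNormalizer(
    clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True
)
tok.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
tok.decoder = decoders.WordPiece(prefix="##")

enc = tok.encode("Hello world")
print(enc.tokens)  # ['hello', 'world'] after lowercasing
print(enc.ids)     # [5, 6]
```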
transformers/models/layoutlmv3/modeling_layoutlmv3.py

@@ -657,6 +657,7 @@ class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, token_sequence_length)`):

@@ -897,6 +898,7 @@ class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         pixel_values: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):

@@ -997,6 +999,7 @@ class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
         return_dict: Optional[bool] = None,
         bbox: Optional[torch.LongTensor] = None,
         pixel_values: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         r"""
         bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):

@@ -1115,6 +1118,7 @@ class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel):
         return_dict: Optional[bool] = None,
         bbox: Optional[torch.LongTensor] = None,
         pixel_values: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
transformers/models/layoutlmv3/tokenization_layoutlmv3.py

@@ -14,7 +14,6 @@
 # limitations under the License.
 """Tokenization class for LayoutLMv3. Same as LayoutLMv2, but RoBERTa-like BPE tokenization instead of WordPiece."""

-import json
 from typing import Optional, Union

 from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors

@@ -159,15 +158,16 @@ class LayoutLMv3Tokenizer(TokenizersBackend):
             CrossEntropyLoss.
         only_label_first_subword (`bool`, *optional*, defaults to `True`):
             Whether or not to only label the first subword, in case word labels are provided.
-        vocab (`dict`, *optional*):
-            Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file when using
-
-
+        vocab (`str` or `dict[str, int]`, *optional*):
+            Custom vocabulary dictionary. If not provided, vocabulary is loaded from `vocab_file` when using
+            `from_pretrained`.
+        merges (`str` or `list[str]`, *optional*):
+            Custom merges list. If not provided, merges are loaded from `merges_file` when using `from_pretrained`.
     """

     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask", "bbox"]
-
+    model = models.BPE

     def __init__(
         self,

@@ -185,69 +185,26 @@ class LayoutLMv3Tokenizer(TokenizersBackend):
         pad_token_box=[0, 0, 0, 0],
         pad_token_label=-100,
         only_label_first_subword=True,
-        vocab: Optional[dict] = None,
-        merges: Optional[list] = None,
-        vocab_file: Optional[str] = None,
-        merges_file: Optional[str] = None,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
+        merges: Optional[Union[str, list[str]]] = None,
         **kwargs,
     ):
         self.add_prefix_space = add_prefix_space
-
-
-        # Priority: 1) vocab/merges dicts/lists, 2) vocab_file/merges_file paths, 3) empty
-        if vocab is not None:
-            _vocab = vocab
-        elif vocab_file is not None:
-            with open(vocab_file, encoding="utf-8") as f:
-                _vocab = json.load(f)
-        else:
-            _vocab = {}
-
-        if merges is not None:
-            _merges = merges
-        elif merges_file is not None:
-            _merges = []
-            with open(merges_file, encoding="utf-8") as f:
-                for line in f:
-                    line = line.strip()
-                    if line and not line.startswith("#"):
-                        _merges.append(tuple(line.split()))
-        else:
-            _merges = []
-
-        # Initialize BPE tokenizer
+        self._vocab = vocab or {}
+        self._merges = merges or []
         self._tokenizer = Tokenizer(
             models.BPE(
-                vocab=_vocab,
-                merges=_merges,
+                vocab=self._vocab,
+                merges=self._merges,
                 dropout=None,
                 continuing_subword_prefix="",
                 end_of_word_suffix="",
                 fuse_unk=False,
             )
         )
-
-        # Set pre_tokenizer (ByteLevel)
         self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
-
-        # Set decoder
         self._tokenizer.decoder = decoders.ByteLevel()
-
-        # Set post_processor (will be set after super().__init__ when we have token IDs)
-        # Temporarily set to None, will be configured after parent init
-        self._tokenizer.post_processor = None
-
-        tokenizer_object = self._tokenizer
-
-        # additional properties
-        self.cls_token_box = cls_token_box
-        self.sep_token_box = sep_token_box
-        self.pad_token_box = pad_token_box
-        self.pad_token_label = pad_token_label
-        self.only_label_first_subword = only_label_first_subword
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             errors=errors,
             bos_token=bos_token,
             eos_token=eos_token,

@@ -277,18 +234,12 @@ class LayoutLMv3Tokenizer(TokenizersBackend):
             add_prefix_space=add_prefix_space,
             trim_offsets=True,
         )
-
-        # additional properties
         self.cls_token_box = cls_token_box
         self.sep_token_box = sep_token_box
         self.pad_token_box = pad_token_box
         self.pad_token_label = pad_token_label
         self.only_label_first_subword = only_label_first_subword

-        # Call _post_init for tokenizers created directly (not from_pretrained)
-        # For from_pretrained, this will be called again after loading the tokenizer from file
-        self._post_init()
-
     @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def __call__(
         self,
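The v3 hunks mirror the v2 change for byte-level BPE: `vocab` and `merges` are taken as in-memory objects (defaulting to empty) and the vocab.json / merges.txt parsing is gone from `__init__`. Below is a minimal sketch of that construction with a toy vocab/merges pair, illustrative only and assuming a recent `tokenizers` release (as the diff itself does).

```python
from tokenizers import Tokenizer, decoders, models, pre_tokenizers

# toy byte-level BPE vocab: every merge result must itself be in the vocab
vocab = {"h": 0, "e": 1, "l": 2, "o": 3, "he": 4, "hel": 5, "hell": 6, "hello": 7}
merges = [("h", "e"), ("he", "l"), ("hel", "l"), ("hell", "o")]

tok = Tokenizer(
    models.BPE(
        vocab=vocab,
        merges=merges,
        dropout=None,
        continuing_subword_prefix="",
        end_of_word_suffix="",
        fuse_unk=False,
    )
)
tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tok.decoder = decoders.ByteLevel()

enc = tok.encode("hello")
print(enc.tokens, enc.ids)  # ['hello'] [7]
print(tok.decode(enc.ids))  # 'hello'
```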
transformers/models/layoutxlm/tokenization_layoutxlm.py

@@ -150,8 +150,8 @@ class LayoutXLMTokenizer(TokenizersBackend):
     refer to this superclass for more information regarding those methods.

     Args:
-        vocab (`
-            Vocabulary for the tokenizer as a list of (token, score) tuples.
+        vocab (`str`, `dict` or `list`, *optional*):
+            Vocabulary for the tokenizer as a path, a dictionary or a list of `(token, score)` tuples.
         bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

@@ -206,12 +206,11 @@ class LayoutXLMTokenizer(TokenizersBackend):

     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
-
+    model = Unigram

     def __init__(
         self,
-
-        vocab=None,
+        vocab: Optional[Union[str, list]] = None,
         bos_token="<s>",
         eos_token="</s>",
         sep_token="</s>",

@@ -229,17 +228,10 @@ class LayoutXLMTokenizer(TokenizersBackend):
     ):
         # Mask token behave like a normal word, i.e. include the space before it
         mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
         self.add_prefix_space = add_prefix_space

-        # Build vocab from list of tuples if provided, else use default
-        # Handle both list of tuples (when creating) and dict (when loading)
         if vocab is not None:
-
-            # Convert dict to list of tuples
-            self._vocab = [(token, score) for token, score in vocab.items()]
-        else:
-            self._vocab = vocab
+            self._vocab = vocab
         else:
             self._vocab = [
                 ("<s>", 0.0),

@@ -250,10 +242,7 @@ class LayoutXLMTokenizer(TokenizersBackend):
         if mask_token not in [v[0] for v in self._vocab]:
             self._vocab.append((str(mask_token), 0.0))

-        # Create the Unigram tokenizer
         self._tokenizer = Tokenizer(Unigram(self._vocab, unk_id=3, byte_fallback=False))
-
-        # Set up normalizer (strip right, replace multiple spaces)
         self._tokenizer.normalizer = normalizers.Sequence(
             [
                 normalizers.Strip(left=False, right=True),

@@ -261,30 +250,11 @@ class LayoutXLMTokenizer(TokenizersBackend):
             ]
         )

-        # Set up pre_tokenizer (Metaspace)
         prepend_scheme = _get_prepend_scheme(add_prefix_space, self)
         self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)

-        # Set up decoder
         self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)

-        # Set up post_processor for XLM-RoBERTa style
-        # Get token IDs
-        cls_token_id = self._get_token_id(str(cls_token))
-        sep_token_id = self._get_token_id(str(sep_token))
-
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single="<s> $A </s>",
-            pair="<s> $A </s> </s> $B </s>",
-            special_tokens=[
-                ("<s>", cls_token_id),
-                ("</s>", sep_token_id),
-            ],
-        )
-
-        tokenizer_object = self._tokenizer
-
-        # additional properties
         self.cls_token_box = cls_token_box
         self.sep_token_box = sep_token_box
         self.pad_token_box = pad_token_box

@@ -292,7 +262,6 @@ class LayoutXLMTokenizer(TokenizersBackend):
         self.only_label_first_subword = only_label_first_subword

         super().__init__(
-            tokenizer_object=tokenizer_object,
             bos_token=bos_token,
             eos_token=eos_token,
             sep_token=sep_token,

@@ -300,7 +269,6 @@ class LayoutXLMTokenizer(TokenizersBackend):
             unk_token=unk_token,
             pad_token=pad_token,
             mask_token=mask_token,
-            vocab_file=vocab_file,
             vocab=vocab,
             add_prefix_space=add_prefix_space,
             cls_token_box=cls_token_box,

@@ -311,7 +279,14 @@ class LayoutXLMTokenizer(TokenizersBackend):
             **kwargs,
         )

-        self.
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{str(self.cls_token)}:0 $A:0 {str(self.sep_token)}:0",
+            pair=f"{str(self.cls_token)}:0 $A:0 {str(self.sep_token)}:0 {str(self.sep_token)}:0 $B:0 {str(self.sep_token)}:0",
+            special_tokens=[
+                (str(self.cls_token), self.cls_token_id),
+                (str(self.sep_token), self.sep_token_id),
+            ],
+        )

     def _get_token_id(self, token: str) -> int:
         """Helper to get token ID from vocab."""