transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
--- transformers/models/mpnet/tokenization_mpnet.py (5.0.0rc0)
+++ transformers/models/mpnet/tokenization_mpnet.py (5.0.0rc2)
@@ -15,7 +15,7 @@
 # limitations under the License.
 """Tokenization classes for MPNet."""
 
-from typing import Optional
+from typing import Optional, Union
 
 from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
 from tokenizers.models import WordPiece
@@ -38,7 +38,7 @@ class MPNetTokenizer(TokenizersBackend):
     refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab (`dict`, *optional*):
+        vocab (`str` or `dict[str, int]`, *optional*):
             Dictionary mapping tokens to their IDs. If not provided, an empty vocab is initialized.
         do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
@@ -87,10 +87,11 @@ class MPNetTokenizer(TokenizersBackend):
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    model = WordPiece
 
     def __init__(
         self,
-        vocab: Optional[dict] = None,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
         do_lower_case=True,
         bos_token="<s>",
         eos_token="</s>",
@@ -104,12 +105,7 @@ class MPNetTokenizer(TokenizersBackend):
         **kwargs,
     ):
         # Initialize vocab
-        if vocab is not None:
-            self._vocab = (
-                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
-            )
-        else:
-            self._vocab = {}
+        self._vocab = vocab if vocab is not None else {}
 
         # Initialize the tokenizer with WordPiece model
         self._tokenizer = Tokenizer(WordPiece(self._vocab, unk_token=str(unk_token)))
@@ -142,11 +138,7 @@ class MPNetTokenizer(TokenizersBackend):
         # Mask token behave like a normal word, i.e. include the space before it
         mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
 
-        # Store for later use
-        tokenizer_object = self._tokenizer
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
            do_lower_case=do_lower_case,
            bos_token=bos_token,
            eos_token=eos_token,
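
Note: after this change, `MPNetTokenizer.__init__` no longer normalizes a list of `(token, score)` pairs; a `dict[str, int]` (or, per the widened annotation, a vocab file path) is handed straight to the `tokenizers` WordPiece model. A minimal standalone sketch of that construction path, using only the `tokenizers` package (the whitespace pre-tokenizer is added here only so the demo tokenizes; the real class configures its own normalizer, pre-tokenizer, and post-processor):

```python
from tokenizers import Tokenizer, pre_tokenizers
from tokenizers.models import WordPiece

# A dict vocab now flows straight into WordPiece, as in the rewritten __init__.
vocab = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "hello": 3, "world": 4}
tokenizer = Tokenizer(WordPiece(vocab, unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
print(tokenizer.encode("hello world").tokens)  # ['hello', 'world']
```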
--- transformers/models/mpt/modeling_mpt.py (5.0.0rc0)
+++ transformers/models/mpt/modeling_mpt.py (5.0.0rc2)
@@ -498,6 +498,7 @@ class MptForSequenceClassification(MptPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], SequenceClassifierOutputWithPast]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
@@ -700,6 +701,7 @@ class MptForQuestionAnswering(MptPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
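
Note: the recurring `+        **kwargs,` additions in this file (and in the `modeling_mra.py` and `modeling_mt5.py` hunks below) follow one pattern: task-head `forward()` signatures now absorb extra keyword arguments instead of raising. A hypothetical, self-contained illustration of the failure mode being avoided (`num_items_in_batch` stands in for the kind of loss-related kwarg recent training code forwards):

```python
# Hypothetical stand-ins for a task head's forward() before and after the change.
def forward_old(input_ids=None, return_dict=None):
    return input_ids

def forward_new(input_ids=None, return_dict=None, **kwargs):
    # Unknown keys are absorbed instead of raising.
    return input_ids

forward_new(input_ids=[1, 2], num_items_in_batch=8)  # accepted
try:
    forward_old(input_ids=[1, 2], num_items_in_batch=8)
except TypeError as err:
    print(err)  # got an unexpected keyword argument 'num_items_in_batch'
```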
@@ -54,7 +54,7 @@ def load_cuda_kernels():
|
|
|
54
54
|
global mra_cuda_kernel
|
|
55
55
|
if not is_kernels_available():
|
|
56
56
|
raise ImportError("kernels is not installed, please install it with `pip install kernels`")
|
|
57
|
-
from
|
|
57
|
+
from ...integrations.hub_kernels import get_kernel
|
|
58
58
|
|
|
59
59
|
mra_cuda_kernel = get_kernel("kernels-community/mra")
|
|
60
60
|
|
|
@@ -796,6 +796,9 @@ class MraPreTrainedModel(PreTrainedModel):
|
|
|
796
796
|
super()._init_weights(module)
|
|
797
797
|
if isinstance(module, MraLMPredictionHead):
|
|
798
798
|
init.zeros_(module.bias)
|
|
799
|
+
elif isinstance(module, MraEmbeddings):
|
|
800
|
+
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)) + 2)
|
|
801
|
+
init.zeros_(module.token_type_ids)
|
|
799
802
|
|
|
800
803
|
|
|
801
804
|
@auto_docstring
|
|
@@ -826,6 +829,7 @@ class MraModel(MraPreTrainedModel):
         inputs_embeds: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithCrossAttentions]:
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states

@@ -919,6 +923,7 @@ class MraForMaskedLM(MraPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MaskedLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

@@ -1007,6 +1012,7 @@ class MraForSequenceClassification(MraPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):

@@ -1086,6 +1092,7 @@ class MraForMultipleChoice(MraPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MultipleChoiceModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):

@@ -1189,6 +1196,7 @@ class MraForTokenClassification(MraPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

@@ -1263,6 +1271,7 @@ class MraForQuestionAnswering(MraPreTrainedModel):
         end_positions: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
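Every MRA head's `forward` gains a `**kwargs` catch-all. The practical effect: shared call sites (training loops, generation utilities) can pass framework-level arguments without triggering `TypeError` on models that ignore them. A toy before/after illustration:

```python
class Before:
    def forward(self, input_ids=None, return_dict=None):
        return input_ids

class After:
    def forward(self, input_ids=None, return_dict=None, **kwargs):
        return input_ids  # extra kwargs are accepted and simply ignored

try:
    Before().forward(input_ids=[1], cache_position=None)
except TypeError as e:
    print(e)  # unexpected keyword argument 'cache_position'

After().forward(input_ids=[1], cache_position=None)  # fine
```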
transformers/models/mt5/configuration_mt5.py

@@ -133,17 +133,16 @@ class MT5Config(PreTrainedConfig):
         if feed_forward_proj == "gated-gelu":
             self.dense_act_fn = "gelu_new"

+        # Force because official weights have False serialized, but we have to tie always
+        kwargs["tie_word_embeddings"] = True
         super().__init__(
             is_encoder_decoder=is_encoder_decoder,
             tokenizer_class=tokenizer_class,
-            tie_word_embeddings=tie_word_embeddings,
             pad_token_id=pad_token_id,
             eos_token_id=eos_token_id,
             decoder_start_token_id=decoder_start_token_id,
             **kwargs,
         )
-        # TODO: Mt5 never supported not tying encoder decoder so this has to be true.
-        self.tie_encoder_decoder = True


 __all__ = ["MT5Config"]
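With this change, `tie_word_embeddings` can no longer be disabled from the constructor: the kwarg is overwritten before `super().__init__` consumes it, and the hard-coded `tie_encoder_decoder = True` moves out of the config. A hypothetical session showing the resulting behavior, based only on the hunk above:

```python
from transformers import MT5Config

# Whatever the caller passes, the constructor forces the tie back on.
config = MT5Config(tie_word_embeddings=False)
print(config.tie_word_embeddings)  # True
```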
transformers/models/mt5/modeling_mt5.py

@@ -671,6 +671,7 @@ class MT5Stack(MT5PreTrainedModel):
         output_hidden_states=None,
         return_dict=None,
         cache_position=None,
+        **kwargs,
     ):
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

@@ -859,12 +860,10 @@ class MT5Model(MT5PreTrainedModel):
         encoder_config = copy.deepcopy(config)
         encoder_config.is_decoder = False
         encoder_config.use_cache = False
-        encoder_config.tie_encoder_decoder = False
         self.encoder = MT5Stack(encoder_config)

         decoder_config = copy.deepcopy(config)
         decoder_config.is_decoder = True
-        decoder_config.tie_encoder_decoder = False
         decoder_config.num_layers = config.num_decoder_layers
         self.decoder = MT5Stack(decoder_config)

@@ -898,6 +897,7 @@ class MT5Model(MT5PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):

@@ -1041,12 +1041,10 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin):
         encoder_config = copy.deepcopy(config)
         encoder_config.is_decoder = False
         encoder_config.use_cache = False
-        encoder_config.tie_encoder_decoder = False
         self.encoder = MT5Stack(encoder_config)

         decoder_config = copy.deepcopy(config)
         decoder_config.is_decoder = True
-        decoder_config.tie_encoder_decoder = False
         decoder_config.num_layers = config.num_decoder_layers
         self.decoder = MT5Stack(decoder_config)

@@ -1064,7 +1062,6 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin):
         self.decoder.set_input_embeddings(new_embeddings)

     @auto_docstring
-    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward with google-t5/->google/, T5->MT5, t5->mt5
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,

@@ -1081,6 +1078,7 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):

@@ -1181,9 +1179,6 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin):

         sequence_output = decoder_outputs[0]

-        if self.config.tie_word_embeddings:
-            sequence_output = sequence_output * (self.model_dim**-0.5)
-
         lm_logits = self.lm_head(sequence_output)

         loss = None
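The last hunk drops the T5-style pre-head rescaling. Under the old path, with tied embeddings the decoder output was multiplied by `model_dim ** -0.5` before the LM projection. A minimal reproduction of the removed computation (illustrative only; where 5.0.0rc2 applies this factor instead is not visible in this hunk):

```python
import torch

d_model = 768
sequence_output = torch.randn(1, 4, d_model)

# What the removed lines computed when config.tie_word_embeddings was True:
scaled = sequence_output * (d_model ** -0.5)
print(round(d_model ** -0.5, 4))  # ~0.0361
```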
@@ -1268,6 +1263,7 @@ class MT5EncoderModel(MT5PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], BaseModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):

@@ -1340,6 +1336,7 @@ class MT5ForSequenceClassification(MT5PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqSequenceClassifierOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):

@@ -1480,6 +1477,7 @@ class MT5ForTokenClassification(MT5PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):

@@ -1545,12 +1543,10 @@ class MT5ForQuestionAnswering(MT5PreTrainedModel):
         encoder_config = copy.deepcopy(config)
         encoder_config.is_decoder = False
         encoder_config.use_cache = False
-        encoder_config.tie_encoder_decoder = False
         self.encoder = MT5Stack(encoder_config)

         decoder_config = copy.deepcopy(config)
         decoder_config.is_decoder = True
-        decoder_config.tie_encoder_decoder = False
         decoder_config.num_layers = config.num_decoder_layers
         self.decoder = MT5Stack(decoder_config)

@@ -1587,6 +1583,7 @@ class MT5ForQuestionAnswering(MT5PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqQuestionAnsweringModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
transformers/models/musicgen/modeling_musicgen.py

@@ -117,6 +117,7 @@ class MusicgenSinusoidalPositionalEmbedding(nn.Module):
     def __init__(self, num_positions: int, embedding_dim: int):
         super().__init__()
         self.embedding_dim = embedding_dim
+        self.num_positions = num_positions
         self.make_weights(num_positions, embedding_dim)

     def make_weights(self, num_embeddings: int, embedding_dim: int):

@@ -432,6 +433,9 @@ class MusicgenPreTrainedModel(PreTrainedModel):
         # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
         if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
             init.zeros_(module.weight[module.padding_idx])
+        elif isinstance(module, MusicgenSinusoidalPositionalEmbedding):
+            emb_weights = module.get_embedding(module.num_positions, module.embedding_dim)
+            init.copy_(module.weights, emb_weights)


 class MusicgenDecoder(MusicgenPreTrainedModel):
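The embedding module now remembers `num_positions` so that `_init_weights` can rebuild the sinusoidal table deterministically instead of relying on construction-time state. A self-contained sketch of what such a `get_embedding` computes (the standard sinusoidal formula; the exact Musicgen variant may interleave sin/cos differently):

```python
import math
import torch

def sinusoidal_table(num_positions: int, embedding_dim: int) -> torch.Tensor:
    # Standard sin/cos positional table; illustrative, not Musicgen's exact layout.
    half_dim = embedding_dim // 2
    freq = torch.exp(torch.arange(half_dim) * -(math.log(10000.0) / (half_dim - 1)))
    args = torch.arange(num_positions).unsqueeze(1) * freq.unsqueeze(0)
    return torch.cat([torch.sin(args), torch.cos(args)], dim=1)

weights = sinusoidal_table(num_positions=2048, embedding_dim=64)
print(weights.shape)  # torch.Size([2048, 64])
```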
@@ -482,6 +486,7 @@ class MusicgenDecoder(MusicgenPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, sequence_length)`):

@@ -716,6 +721,7 @@ class MusicgenModel(MusicgenPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, sequence_length)`):
@@ -2080,7 +2086,6 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         synced_gpus: Optional[bool] = None,
         streamer: Optional["BaseStreamer"] = None,
-        use_model_defaults: Optional[bool] = None,
         **kwargs,
     ):
         """

@@ -2125,11 +2130,6 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
             streamer (`BaseStreamer`, *optional*):
                 Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                 through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
-            use_model_defaults (`bool`, *optional*):
-                When it is `True`, unset parameters in `generation_config` will be set to the model-specific default
-                generation configuration (`model.generation_config`), as opposed to the global defaults
-                (`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be
-                `True`.
             kwargs (`dict[str, Any]`, *optional*):
                 Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                 forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder

@@ -2153,9 +2153,7 @@ class MusicgenForConditionalGeneration(MusicgenPreTrainedModel, GenerationMixin)
         """
         # 1. Handle `generation_config` and kwargs that might update it, and validate the resulting objects
         generation_mode_kwargs = self._extract_generation_mode_kwargs(None, kwargs, False, None, None)
-        generation_config, model_kwargs = self._prepare_generation_config(
-            generation_config, use_model_defaults, **kwargs
-        )
+        generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
         generation_mode = generation_config.get_generation_mode()
         if generation_mode not in [GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH]:
             raise ValueError(
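These three hunks remove the `use_model_defaults` escape hatch from Musicgen's custom `generate`; the diff implies model-level generation defaults no longer need an opt-in flag. A hypothetical migration sketch (the checkpoint id is illustrative):

```python
from transformers import AutoProcessor, MusicgenForConditionalGeneration

processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

inputs = processor(text=["lo-fi beat with a slow tempo"], return_tensors="pt")
# Before: model.generate(**inputs, use_model_defaults=True, max_new_tokens=256)
# After: drop the flag; unset parameters come from model.generation_config.
audio = model.generate(**inputs, max_new_tokens=256)
```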
transformers/models/musicgen_melody/modeling_musicgen_melody.py

@@ -122,6 +122,7 @@ class MusicgenMelodySinusoidalPositionalEmbedding(nn.Module):
     def __init__(self, num_positions: int, embedding_dim: int):
         super().__init__()
         self.embedding_dim = embedding_dim
+        self.num_positions = num_positions
         self.make_weights(num_positions, embedding_dim)

     def make_weights(self, num_embeddings: int, embedding_dim: int):

@@ -403,6 +404,9 @@ class MusicgenMelodyPreTrainedModel(PreTrainedModel):
         # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
         if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
             init.zeros_(module.weight[module.padding_idx])
+        elif isinstance(module, MusicgenMelodySinusoidalPositionalEmbedding):
+            emb_weights = module.get_embedding(module.num_positions, module.embedding_dim)
+            init.copy_(module.weights, emb_weights)


 # Copied from transformers.models.musicgen.modeling_musicgen.MusicgenDecoder with MUSICGEN->MUSICGEN_MELODY,Musicgen->MusicgenMelody

@@ -455,6 +459,7 @@ class MusicgenMelodyDecoder(MusicgenMelodyPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPast]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, sequence_length)`):

@@ -670,6 +675,7 @@ class MusicgenMelodyModel(MusicgenMelodyPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPast]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, sequence_length)`):

@@ -785,6 +791,7 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         labels: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, MusicgenMelodyOutputWithPast]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, sequence_length)`):
transformers/models/mvp/modeling_mvp.py

@@ -21,6 +21,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin

@@ -469,6 +470,11 @@ class MvpPreTrainedModel(PreTrainedModel):
     base_model_prefix = "model"
     supports_gradient_checkpointing = True

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, MvpForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)
+
     @property
     def dummy_inputs(self):
         pad_token = self.config.pad_token_id
@@ -534,6 +540,7 @@ class MvpEncoder(MvpPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         Args:

@@ -698,6 +705,7 @@ class MvpDecoder(MvpPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         r"""
         Args:

@@ -917,6 +925,7 @@ class MvpModel(MvpPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):

@@ -1065,6 +1074,7 @@ class MvpForConditionalGeneration(MvpPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):

@@ -1213,6 +1223,7 @@ class MvpForSequenceClassification(MvpPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqSequenceClassifierOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):

@@ -1372,6 +1383,7 @@ class MvpForQuestionAnswering(MvpPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqQuestionAnsweringModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):

@@ -1503,6 +1515,7 @@ class MvpDecoderWrapper(MvpPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.decoder = MvpDecoder(config)
+        self.post_init()

     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)
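`MvpDecoderWrapper` now ends `__init__` with `self.post_init()`, matching the usual `PreTrainedModel` convention: weight initialization and final setup run once the submodules exist. A minimal sketch of that convention with toy classes (not the real Mvp types):

```python
import torch.nn as nn
from transformers import PreTrainedConfig, PreTrainedModel

class ToyConfig(PreTrainedConfig):
    model_type = "toy"

class ToyWrapper(PreTrainedModel):
    config_class = ToyConfig

    def __init__(self, config):
        super().__init__(config)
        self.linear = nn.Linear(4, 4)
        # Without this call, _init_weights never runs for freshly built models.
        self.post_init()

model = ToyWrapper(ToyConfig())
```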
@@ -1548,6 +1561,7 @@ class MvpForCausalLM(MvpPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
transformers/models/nanochat/modeling_nanochat.py

@@ -30,7 +30,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_func_from_hub
+from ...integrations import use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast

@@ -38,7 +38,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_nanochat import NanoChatConfig


@@ -74,7 +74,7 @@ class NanoChatRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
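Both RoPE hunks (here and in Nemotron below) replace a plain attribute with a registered, non-persistent buffer. The difference: a buffer follows `.to(device)` moves with the module, while `persistent=False` keeps it out of the state dict, so checkpoints stay unchanged. A small demonstration of that distinction:

```python
import torch
from torch import nn

class Rope(nn.Module):
    def __init__(self):
        super().__init__()
        inv_freq = 1.0 / (10000 ** (torch.arange(0, 8, 2) / 8))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # v5 style: the pristine copy is also a buffer, so it moves with the module.
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

rope = Rope()
print(rope.state_dict().keys())          # odict_keys([]) - not serialized
print(dict(rope.named_buffers()).keys()) # both buffers still live on the module
```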
@@ -113,7 +113,7 @@ class NanoChatRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
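`maybe_autocast` is a new helper in `transformers.utils.generic`; the diff only shows that its call signature mirrors `torch.autocast`. A plausible minimal implementation under that assumption (hypothetical; the real helper may handle more cases, such as device types without autocast support):

```python
import contextlib
import torch

def maybe_autocast(device_type: str, enabled: bool = True, **kwargs):
    # Hypothetical sketch: fall back to a no-op context when torch.autocast
    # rejects the device type, otherwise behave exactly like it.
    try:
        return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
    except RuntimeError:
        return contextlib.nullcontext()

with maybe_autocast(device_type="cpu", enabled=False):  # force float32 math
    x = torch.randn(2, 2) @ torch.randn(2, 2)
```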
@@ -195,6 +195,7 @@ def rotate_half(x):
     return torch.cat((x2, -x1), dim=-1)


+@use_kernelized_func(apply_rotary_pos_emb)
 class NanoChatAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""


@@ -220,7 +221,6 @@ class NanoChatAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb

         self.q_norm = NanoChatRMSNorm(eps=config.rms_norm_eps)
         self.k_norm = NanoChatRMSNorm(eps=config.rms_norm_eps)
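Taken together, these two hunks swap a per-instance `self.rotary_fn = apply_rotary_pos_emb` assignment for the class decorator `use_kernelized_func(apply_rotary_pos_emb)`. The decorator's internals are not shown in this diff; a reasonable reading is that it attaches a (possibly kernel-accelerated) version of the wrapped function to the class, giving a single hook point for substitution. A hypothetical sketch of that shape, all names ours:

```python
def rotary_ref(q, k, cos, sin):
    # Stands in for apply_rotary_pos_emb; only the signature matters here.
    return q, k

def use_kernelized_func(func):
    # Hypothetical decorator: expose `func` on the class so methods can call
    # it via `self`, leaving room to swap in a compiled kernel later.
    def wrap(cls):
        cls.rotary_fn = staticmethod(func)
        return cls
    return wrap

@use_kernelized_func(rotary_ref)
class Attention:
    pass

assert Attention.rotary_fn is rotary_ref  # the per-instance assignment is gone
```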
transformers/models/nemotron/modeling_nemotron.py

@@ -45,6 +45,7 @@ from ...modeling_rope_utils import (
 )
 from ...modeling_utils import PreTrainedModel
 from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
+from ...utils.generic import maybe_autocast
 from .configuration_nemotron import NemotronConfig


@@ -87,7 +88,7 @@ class NemotronLayerNorm1P(nn.LayerNorm):
         args = _cast_if_autocast_enabled(
             device_type, input, self.normalized_shape, self.weight + 1, self.bias, self.eps
         )
-        with torch.autocast(device_type=input.device.type, enabled=False):
+        with maybe_autocast(device_type=input.device.type, enabled=False):
             return F.layer_norm(*args)


@@ -109,7 +110,7 @@ class NemotronRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     # Ignore copy

@@ -151,7 +152,7 @@ class NemotronRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling

@@ -396,8 +397,8 @@ class NemotronFlashAttention2(NemotronAttention):
                 else torch.get_autocast_gpu_dtype()
             )
             # Handle the case where the model is quantized
-            elif hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
+            elif hasattr(self.config, "quantization_config"):
+                target_dtype = self.config.dtype
             else:
                 target_dtype = self.q_proj.weight.dtype
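The flash-attention hunk changes how the upcast target dtype is picked for quantized models: instead of reading the private `_pre_quantization_dtype`, the code keys off the public `quantization_config` attribute and uses `config.dtype`. A condensed sketch of the selection order the new code implements (names taken from the hunk; the surrounding control flow is reconstructed):

```python
import torch

def pick_target_dtype(config, q_proj_weight: torch.Tensor) -> torch.dtype:
    # Order shown in the hunk: autocast dtype > quantized-model dtype > weight dtype.
    if torch.is_autocast_enabled():
        return torch.get_autocast_gpu_dtype()
    elif hasattr(config, "quantization_config"):
        return config.dtype
    return q_proj_weight.dtype
```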
@@ -657,6 +658,7 @@ class NemotronModel(NemotronPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> BaseModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
transformers/models/nllb/tokenization_nllb.py

@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional
+from typing import Optional, Union

 from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
 from tokenizers.models import BPE

@@ -83,13 +83,15 @@ class NllbTokenizer(TokenizersBackend):

     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
-
+    model = BPE

     prefix_tokens: list[int] = []
     suffix_tokens: list[int] = []

     def __init__(
         self,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
+        merges: Optional[Union[str, list[str]]] = None,
         bos_token="<s>",
         eos_token="</s>",
         sep_token="</s>",

@@ -101,16 +103,11 @@ class NllbTokenizer(TokenizersBackend):
         tgt_lang=None,
         additional_special_tokens=None,
         legacy_behaviour=False,
-        vocab=None,
-        merges=None,
-        vocab_file=None,
         **kwargs,
     ):
         if additional_special_tokens is None:
             additional_special_tokens = kwargs.get("extra_special_tokens", FAIRSEQ_LANGUAGE_CODES)

-        self.vocab_file = vocab_file
-
         mask_token = (
             AddedToken(mask_token, normalized=True, lstrip=True, special=True)
             if isinstance(mask_token, str)

@@ -118,23 +115,15 @@ class NllbTokenizer(TokenizersBackend):
         )
         self.legacy_behaviour = legacy_behaviour

-        if vocab is not None:
-            if isinstance(vocab, list):
-                self._vocab = {token: idx for idx, (token, _score) in enumerate(vocab)}
-            else:
-                self._vocab = vocab
-        else:
-            self._vocab = {
+        if vocab is None:
+            vocab = {
                 str(bos_token): 0,
                 str(pad_token): 1,
                 str(eos_token): 2,
                 str(unk_token): 3,
             }
-
-            if merges is None:
-                self._merges = []
-            else:
-                self._merges = merges
+        self._vocab = vocab
+        self._merges = merges or []

         self._tokenizer = Tokenizer(
             BPE(

@@ -158,13 +147,10 @@ class NllbTokenizer(TokenizersBackend):
         self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True)
         self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)

-        tokenizer_object = self._tokenizer
-
         # Remove extra_special_tokens from kwargs if present to avoid conflict
         kwargs.pop("extra_special_tokens", None)

         super().__init__(
-            tokenizer_object=tokenizer_object,
             bos_token=bos_token,
             eos_token=eos_token,
             sep_token=sep_token,
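After the rewrite, `vocab` and `merges` lead the constructor signature, the `vocab_file` parameter is gone, and a missing vocab falls back to a four-token special-token map. A small hypothetical instantiation based on the new signature:

```python
from transformers import NllbTokenizer

# Toy vocab/merges for illustration; real checkpoints ship full BPE tables,
# and the new annotations suggest file paths are also accepted.
vocab = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3, "▁he": 4, "llo": 5, "▁hello": 6}
merges = ["▁he llo"]

tokenizer = NllbTokenizer(vocab=vocab, merges=merges)
```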
transformers/models/nllb_moe/configuration_nllb_moe.py

@@ -206,6 +206,7 @@ class NllbMoeConfig(PreTrainedConfig):
         self.moe_eval_capacity_token_fraction = moe_eval_capacity_token_fraction
         self.moe_token_dropout = moe_token_dropout
         self.output_router_logits = output_router_logits
+
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,