transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0

transformers/models/llava_next_video/modeling_llava_next_video.py

@@ -868,6 +868,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- extra custom processing

@@ -879,12 +880,15 @@
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        #
-        #
-
+        # Pixel values are used only in the first iteration if available
+        # In subsquent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache)
+        if is_first_iteration or not kwargs.get("use_cache", True):
             model_inputs["pixel_values"] = pixel_values
             model_inputs["pixel_values_videos"] = pixel_values_videos
             model_inputs["image_sizes"] = image_sizes
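Across these llava hunks (and the matching modular and onevision hunks below), the gate for forwarding vision tensors moves from a cache-position check to an explicit `is_first_iteration` flag, so multimodal inputs are forwarded on the first generation step even when decoding continues from a pre-filled cache (for example a cached system prompt). A standalone sketch of just that gate; `prepare_vision_inputs` is a hypothetical helper, not a transformers API:

```python
# Standalone sketch of the gating logic added above; not library code.
def prepare_vision_inputs(model_inputs, pixel_values, is_first_iteration, use_cache=True):
    # First iteration (or cache-less decoding): the model still needs raw pixels.
    # Later iterations: the image features are already merged into the KV cache.
    if is_first_iteration or not use_cache:
        model_inputs["pixel_values"] = pixel_values
    return model_inputs

prefill = prepare_vision_inputs({}, pixel_values="<pixels>", is_first_iteration=True)
decode = prepare_vision_inputs({}, pixel_values="<pixels>", is_first_iteration=False)
assert "pixel_values" in prefill and "pixel_values" not in decode
```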

transformers/models/llava_next_video/modular_llava_next_video.py

@@ -693,6 +693,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- extra custom processing

@@ -704,12 +705,15 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        #
-        #
-
+        # Pixel values are used only in the first iteration if available
+        # In subsquent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache)
+        if is_first_iteration or not kwargs.get("use_cache", True):
             model_inputs["pixel_values"] = pixel_values
             model_inputs["pixel_values_videos"] = pixel_values_videos
             model_inputs["image_sizes"] = image_sizes

transformers/models/llava_onevision/image_processing_llava_onevision_fast.py

@@ -279,7 +279,6 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast):

         if do_pad:
             processed_images = self._pad_for_batching(processed_images)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(
             data={"pixel_values": processed_images, "image_sizes": image_sizes, "batch_num_images": batch_num_images},
             tensor_type=return_tensors,
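The removed line stacked the processed images eagerly whenever `return_tensors` was truthy; the list now goes to `BatchFeature(..., tensor_type=return_tensors)` un-stacked, presumably relying on the tensor conversion there (note that `feature_extraction_utils.py` also changed in this release, and the same removal appears again in `modular_llava_onevision.py` below). A sketch of what the removed line did, with the centralized-conversion behavior stated as an assumption:

```python
import torch

processed_images = [torch.zeros(3, 4, 4), torch.zeros(3, 4, 4)]
# What the removed line did whenever return_tensors was truthy:
pixel_values = torch.stack(processed_images, dim=0)
print(pixel_values.shape)  # torch.Size([2, 3, 4, 4])
# After the change, the un-stacked list is handed straight to
# BatchFeature(..., tensor_type=return_tensors); the assumption is that its
# tensor conversion now performs this stacking once, centrally.
```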

transformers/models/llava_onevision/modeling_llava_onevision.py

@@ -846,6 +846,7 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

@@ -857,12 +858,15 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if
-            #
-            #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["image_sizes"] = image_sizes
             model_inputs["pixel_values_videos"] = pixel_values_videos

transformers/models/llava_onevision/modular_llava_onevision.py

@@ -211,7 +211,6 @@ class LlavaOnevisionImageProcessorFast(LlavaNextImageProcessorFast):

         if do_pad:
             processed_images = self._pad_for_batching(processed_images)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(
             data={"pixel_values": processed_images, "image_sizes": image_sizes, "batch_num_images": batch_num_images},
             tensor_type=return_tensors,

@@ -698,6 +697,7 @@ class LlavaOnevisionForConditionalGeneration(LlavaNextVideoForConditionalGeneration):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

@@ -709,12 +709,15 @@ class LlavaOnevisionForConditionalGeneration(LlavaNextVideoForConditionalGeneration):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if
-            #
-            #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["image_sizes"] = image_sizes
             model_inputs["pixel_values_videos"] = pixel_values_videos

transformers/models/longcat_flash/modeling_longcat_flash.py

@@ -40,7 +40,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_longcat_flash import LongcatFlashConfig


@@ -82,7 +82,7 @@ class LongcatFlashRotaryEmbedding(nn.Module):
|
|
|
82
82
|
inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
|
|
83
83
|
|
|
84
84
|
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
|
85
|
-
self.original_inv_freq =
|
|
85
|
+
self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
|
|
86
86
|
|
|
87
87
|
@staticmethod
|
|
88
88
|
def compute_default_rope_parameters(
|
|
@@ -121,7 +121,7 @@ class LongcatFlashRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
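Editor's note: the hunk swaps a bare autocast context for the new `maybe_autocast` helper while keeping the intent: rotary tables must be computed in float32 even under bf16/fp16 autocast. A standalone sketch of that intent using plain `torch.autocast` (shapes are toy values):

    import torch

    def rope_tables(inv_freq, position_ids, device_type="cpu"):
        inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        with torch.autocast(device_type=device_type, enabled=False):  # force float32
            freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
        return emb.cos(), emb.sin()

    cos, sin = rope_tables(torch.rand(4), torch.arange(6)[None, :])
    print(cos.shape, cos.dtype)  # torch.Size([1, 6, 8]) torch.float32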
@@ -431,7 +431,7 @@ class LongcatFlashMLA(nn.Module):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

-        if self.config._attn_implementation
+        if "flash" in self.config._attn_implementation and self.qk_head_dim != self.v_head_dim:
             value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim])

         attention_interface: Callable = eager_attention_forward
@@ -449,7 +449,7 @@ class LongcatFlashMLA(nn.Module):
             **kwargs,
         )

-        if self.config._attn_implementation
+        if "flash" in self.config._attn_implementation and self.qk_head_dim != self.v_head_dim:
             attn_output = attn_output[:, :, :, : self.v_head_dim]

         attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
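Editor's note: both hunks tighten the same guard: the value padding is only needed by flash-style kernels, which expect Q, K and V to share a head dim, and only when `qk_head_dim != v_head_dim` (as in MLA). A sketch of the pad-then-slice round trip with toy shapes:

    import torch
    import torch.nn.functional as F

    qk_head_dim, v_head_dim = 192, 128
    value_states = torch.randn(1, 8, 16, v_head_dim)  # (batch, heads, seq, v_dim)
    # Zero-pad V up to the Q/K head dim before the kernel...
    value_states = F.pad(value_states, [0, qk_head_dim - v_head_dim])
    assert value_states.shape[-1] == qk_head_dim
    # ...and drop the padded columns from the attention output afterwards.
    attn_output = torch.randn(1, 16, 8, qk_head_dim)  # (batch, seq, heads, dim)
    attn_output = attn_output[:, :, :, :v_head_dim]
    assert attn_output.shape[-1] == v_head_dim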
@@ -563,6 +563,7 @@ class LongcatFlashPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, LongcatFlashTopkRouter):
             init.normal_(module.classifier.weight, mean=0.0, std=self.config.initializer_range)
+            init.zeros_(module.e_score_correction_bias)
         if isinstance(module, LongcatFlashExperts):
             if module.gate_up_proj is not None:
                 init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
@@ -215,7 +215,7 @@ class LongcatFlashMLA(DeepseekV3Attention):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

-        if self.config._attn_implementation
+        if "flash" in self.config._attn_implementation and self.qk_head_dim != self.v_head_dim:
             value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim])

         attention_interface: Callable = eager_attention_forward
@@ -233,7 +233,7 @@ class LongcatFlashMLA(DeepseekV3Attention):
             **kwargs,
         )

-        if self.config._attn_implementation
+        if "flash" in self.config._attn_implementation and self.qk_head_dim != self.v_head_dim:
             attn_output = attn_output[:, :, :, : self.v_head_dim]

         attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
@@ -347,6 +347,7 @@ class LongcatFlashPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, LongcatFlashTopkRouter):
             init.normal_(module.classifier.weight, mean=0.0, std=self.config.initializer_range)
+            init.zeros_(module.e_score_correction_bias)
         if isinstance(module, LongcatFlashExperts):
             if module.gate_up_proj is not None:
                 init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
@@ -1414,6 +1414,7 @@ class LongformerModel(LongformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, LongformerBaseModelOutputWithPooling]:
         r"""
         global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1567,6 +1568,7 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, LongformerMaskedLMOutput]:
         r"""
         global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1678,6 +1680,7 @@ class LongformerForSequenceClassification(LongformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, LongformerSequenceClassifierOutput]:
         r"""
         global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1800,6 +1803,7 @@ class LongformerForQuestionAnswering(LongformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, LongformerQuestionAnsweringModelOutput]:
         r"""
         global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1928,6 +1932,7 @@ class LongformerForTokenClassification(LongformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, LongformerTokenClassifierOutput]:
         r"""
         global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -2007,6 +2012,7 @@ class LongformerForMultipleChoice(LongformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, LongformerMultipleChoiceModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
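Editor's note: the six Longformer hunks above (and the LongT5, LUKE, LXMERT and M2M100 hunks below) all make the same mechanical change: appending `**kwargs` so `forward` tolerates extra keyword arguments from newer call sites instead of raising. In miniature:

    def forward_old(input_ids, return_dict=None):
        return input_ids

    def forward_new(input_ids, return_dict=None, **kwargs):
        return input_ids  # unknown kwargs are simply absorbed

    forward_new([1, 2, 3], cache_position=[0])  # fine
    try:
        forward_old([1, 2, 3], cache_position=[0])
    except TypeError as exc:
        print(f"old signature rejects new kwargs: {exc}")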
@@ -1283,6 +1283,7 @@ class LongT5Stack(LongT5PreTrainedModel):
         output_hidden_states=None,
         return_dict=None,
         cache_position=None,
+        **kwargs,
     ):
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1582,12 +1583,10 @@ class LongT5Model(LongT5PreTrainedModel):
         encoder_config = copy.deepcopy(config)
         encoder_config.is_decoder = False
         encoder_config.use_cache = False
-        encoder_config.tie_encoder_decoder = False
         self.encoder = LongT5Stack(encoder_config)

         decoder_config = copy.deepcopy(config)
         decoder_config.is_decoder = True
-        decoder_config.tie_encoder_decoder = False
         decoder_config.num_layers = config.num_decoder_layers
         self.decoder = LongT5Stack(decoder_config)

@@ -1618,6 +1617,7 @@ class LongT5Model(LongT5PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1744,12 +1744,10 @@ class LongT5ForConditionalGeneration(LongT5PreTrainedModel, GenerationMixin):
         encoder_config = copy.deepcopy(config)
         encoder_config.is_decoder = False
         encoder_config.use_cache = False
-        encoder_config.tie_encoder_decoder = False
         self.encoder = LongT5Stack(encoder_config)

         decoder_config = copy.deepcopy(config)
         decoder_config.is_decoder = True
-        decoder_config.tie_encoder_decoder = False
         decoder_config.num_layers = config.num_decoder_layers
         self.decoder = LongT5Stack(decoder_config)

@@ -1783,6 +1781,7 @@ class LongT5ForConditionalGeneration(LongT5PreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1946,6 +1945,7 @@ class LongT5EncoderModel(LongT5PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], BaseModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -837,6 +837,7 @@ class LukeModel(LukePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseLukeModelOutputWithPooling]:
         r"""
         entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
@@ -1087,6 +1088,7 @@ class LukeForMaskedLM(LukePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, LukeMaskedLMOutput]:
         r"""
         entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
@@ -1220,6 +1222,7 @@ class LukeForEntityClassification(LukePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, EntityClassificationOutput]:
         r"""
         entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
@@ -1348,6 +1351,7 @@ class LukeForEntityPairClassification(LukePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, EntityPairClassificationOutput]:
         r"""
         entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
@@ -1483,6 +1487,7 @@ class LukeForEntitySpanClassification(LukePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, EntitySpanClassificationOutput]:
         r"""
         entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
@@ -1638,6 +1643,7 @@ class LukeForSequenceClassification(LukePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, LukeSequenceClassifierOutput]:
         r"""
         entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
@@ -1764,6 +1770,7 @@ class LukeForTokenClassification(LukePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, LukeTokenClassifierOutput]:
         r"""
         entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
@@ -1865,6 +1872,7 @@ class LukeForQuestionAnswering(LukePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, LukeQuestionAnsweringModelOutput]:
         r"""
         entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
@@ -1982,6 +1990,7 @@ class LukeForMultipleChoice(LukePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, LukeMultipleChoiceModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -20,7 +20,7 @@ from collections.abc import Mapping
 from typing import Optional, Union

 import numpy as np
-from tokenizers import Tokenizer, decoders, pre_tokenizers
+from tokenizers import Tokenizer, decoders, pre_tokenizers
 from tokenizers.models import BPE

 from ...tokenization_python import PreTrainedTokenizer
@@ -167,6 +167,10 @@ class LukeTokenizer(TokenizersBackend):
             Path to the vocabulary file.
         merges_file (`str`):
             Path to the merges file.
+        vocab (`str` or `dict[str, int]`, *optional*):
+            Custom vocabulary dictionary. If not provided, the vocabulary is loaded from `vocab_file`.
+        merges (`str` or `list[str]`, *optional*):
+            Custom merges list. If not provided, merges are loaded from `merges_file`.
         entity_vocab_file (`str`):
             Path to the entity vocabulary file.
         task (`str`, *optional*):
@@ -228,10 +232,13 @@ class LukeTokenizer(TokenizersBackend):

     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
-
+    model = BPE

     def __init__(
         self,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
+        merges: Optional[Union[str, list[str]]] = None,
+        entity_vocab: Optional[Union[str, dict, list]] = None,
         errors="replace",
         bos_token="<s>",
         eos_token="</s>",
@@ -250,37 +257,17 @@ class LukeTokenizer(TokenizersBackend):
         entity_pad_token="[PAD]",
         entity_mask_token="[MASK]",
         entity_mask2_token="[MASK2]",
-        vocab: Optional[dict] = None,
-        merges: Optional[list] = None,
-        entity_vocab: Optional[dict] = None,
         **kwargs,
     ):
         self.add_prefix_space = add_prefix_space

         # Handle entity vocab file for backward compatibility
         entity_vocab_file = kwargs.pop("entity_vocab_file", None)
-
-        # Check if vocab/merges/entity_vocab are in kwargs
-        if vocab is None and "vocab" in kwargs:
-            vocab = kwargs.pop("vocab")
-        if merges is None and "merges" in kwargs:
-            merges = kwargs.pop("merges")
         if entity_vocab is None and "entity_vocab" in kwargs:
             entity_vocab = kwargs.pop("entity_vocab")

-
-
-            self._vocab = (
-                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
-            )
-        else:
-            self._vocab = {}
-
-        if merges is not None:
-            self._merges = [tuple(merge) if isinstance(merge, list) else merge for merge in merges]
-        else:
-            self._merges = []
-
+        self._vocab = vocab or {}
+        self._merges = merges or []
         self._tokenizer = Tokenizer(
             BPE(
                 vocab=self._vocab,
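Editor's note: the constructor now takes `vocab`/`merges`/`entity_vocab` as explicit leading parameters and feeds them straight into the backing BPE model. A sketch of that final step using the `tokenizers` library directly (toy vocab and merges, not a real LUKE checkpoint):

    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    vocab = {"l": 0, "o": 1, "w": 2, "lo": 3, "low": 4, "<unk>": 5}
    merges = [("l", "o"), ("lo", "w")]  # applied in order during encoding
    tokenizer = Tokenizer(BPE(vocab=vocab, merges=merges, unk_token="<unk>"))
    print(tokenizer.encode("low").tokens)  # ['low']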
@@ -365,8 +352,6 @@ class LukeTokenizer(TokenizersBackend):

         kwargs["extra_special_tokens"] = extra_tokens

-        tokenizer_object = self._tokenizer
-
         # Configure default special token behaviors to match LUKE formatting
         token_type_ids_pattern = kwargs.setdefault("token_type_ids_pattern", "all_zeros")
         special_tokens_pattern = kwargs.setdefault("special_tokens_pattern", "cls_double_sep")
@@ -379,7 +364,6 @@ class LukeTokenizer(TokenizersBackend):
         kwargs.setdefault("clean_up_tokenization_spaces", True)

         super().__init__(
-            tokenizer_object=tokenizer_object,
             errors=errors,
             bos_token=bos_token,
             eos_token=eos_token,
@@ -401,17 +385,6 @@ class LukeTokenizer(TokenizersBackend):
             entity_vocab=entity_vocab if entity_vocab_file is None else None,  # Only store if it was passed as data
             **kwargs,
         )
-        self._post_init()
-
-    def _post_init(self):
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single=f"{self.cls_token}:0 $A:0 {self.sep_token}:0",
-            pair=f"{self.cls_token}:0 $A:0 {self.sep_token}:0 {self.sep_token}:0 $B:1 {self.sep_token}:1",
-            special_tokens=[
-                (self.cls_token, self.cls_token_id),
-                (self.sep_token, self.sep_token_id),
-            ],
-        )

     def build_inputs_with_special_tokens(
         self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
@@ -711,6 +711,7 @@ class LxmertModel(LxmertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[LxmertModelOutput, tuple[torch.FloatTensor]]:
         r"""
         visual_feats (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
@@ -1244,6 +1245,7 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[LxmertForQuestionAnsweringOutput, tuple[torch.FloatTensor]]:
         r"""
         visual_feats (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
@@ -22,6 +22,7 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
@@ -84,6 +85,7 @@ class M2M100SinusoidalPositionalEmbedding(nn.Module):
     def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
         super().__init__()
         self.offset = 2
+        self.num_positions = num_positions
         self.embedding_dim = embedding_dim
         self.padding_idx = padding_idx
         self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
@@ -515,6 +517,14 @@ class M2M100PreTrainedModel(PreTrainedModel):
     # Doesn't support `compile` (dynamic control flow). Can be fixed but low usage model
     _can_compile_fullgraph = False

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, M2M100SinusoidalPositionalEmbedding):
+            emb_weights = module.get_embedding(
+                module.num_positions + module.offset, module.embedding_dim, module.padding_idx
+            )
+            init.copy_(module.weights, emb_weights)
+

 class M2M100Encoder(M2M100PreTrainedModel):
     """
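Editor's note: the new `_init_weights` above rebuilds the sinusoidal table via `module.get_embedding(...)` and copies it into the module's weights. A simplified restatement of the fairseq-style table it computes (see `modeling_m2m_100.py` for the authoritative version; shapes here are toy values):

    import math

    import torch

    def sinusoidal_table(num_positions, embedding_dim, padding_idx=None):
        half_dim = embedding_dim // 2
        freq = torch.exp(torch.arange(half_dim, dtype=torch.float) * -(math.log(10000.0) / (half_dim - 1)))
        angles = torch.arange(num_positions, dtype=torch.float)[:, None] * freq[None, :]
        table = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
        if padding_idx is not None:
            table[padding_idx] = 0.0  # the padding position carries no signal
        return table

    print(sinusoidal_table(10 + 2, 16, padding_idx=1).shape)  # torch.Size([12, 16])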
@@ -561,6 +571,7 @@ class M2M100Encoder(M2M100PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -713,6 +724,7 @@ class M2M100Decoder(M2M100PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -941,6 +953,7 @@ class M2M100Model(M2M100PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1046,6 +1059,7 @@ class M2M100ForConditionalGeneration(M2M100PreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):