transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0

transformers/models/florence2/modeling_florence2.py

```diff
@@ -26,6 +26,7 @@ from typing import Any, Optional, Union
 import torch.nn as nn
 import torch.nn.functional as F
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache
 from ...generation import GenerationMixin
@@ -541,7 +542,7 @@ class Florence2VisionBackbone(Florence2VisionPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
-    def forward(self, hidden_states: torch.Tensor):
+    def forward(self, hidden_states: torch.Tensor, **kwargs):
         for conv, block in zip(self.convs, self.blocks):
             hidden_states = conv(hidden_states)
             for layer in block:
@@ -629,6 +630,18 @@ class Florence2PreTrainedModel(PreTrainedModel):
     _supports_attention_backend = False
     config_class = Florence2Config
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Florence2VisionPositionalEmbeddingCosine1D):
+            pos_idx_to_embed = torch.empty((module.max_seq_len, module.embed_dim))
+            sine, cosine = module.get_sinusoid_embeddings(
+                max_positions=module.max_seq_len,
+                embed_dim=module.embed_dim,
+            )
+            pos_idx_to_embed[:, 0::2] = sine
+            pos_idx_to_embed[:, 1::2] = cosine
+            init.copy_(module.pos_idx_to_embed, pos_idx_to_embed)
+
 
 @auto_docstring(
     custom_intro="""
```
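
The `_init_weights` hunk above illustrates the new `transformers.initialization` helpers (`init.copy_`, `init.zeros_`, imported as `from ... import initialization as init`) that recur throughout this diff. Below is a minimal standalone sketch of the pattern; the helper and module here are stand-ins written for illustration, and the real helpers also integrate with meta-device loading, which this sketch does not model:

```python
import torch
import torch.nn as nn


def copy_(tensor: torch.Tensor, value: torch.Tensor) -> None:
    # Stand-in for transformers.initialization.copy_ as used in the hunk above.
    with torch.no_grad():
        tensor.copy_(value)


class CosinePositionalEmbedding(nn.Module):
    # Hypothetical module mirroring Florence2VisionPositionalEmbeddingCosine1D:
    # the sinusoid table lives in a buffer that starts out uninitialized.
    def __init__(self, max_seq_len: int = 8, embed_dim: int = 4):
        super().__init__()
        self.max_seq_len, self.embed_dim = max_seq_len, embed_dim
        self.register_buffer("pos_idx_to_embed", torch.empty(max_seq_len, embed_dim))

    def get_sinusoid_embeddings(self, max_positions: int, embed_dim: int):
        half_dim = embed_dim // 2
        freq = torch.exp(torch.arange(half_dim) * -(torch.log(torch.tensor(10000.0)) / half_dim))
        angles = torch.arange(max_positions).float()[:, None] * freq[None, :]
        return torch.sin(angles), torch.cos(angles)


def _init_weights(module: nn.Module) -> None:
    # Mirrors the hunk: compute the table, interleave sine/cosine columns,
    # then write it into the buffer through the init helper.
    if isinstance(module, CosinePositionalEmbedding):
        table = torch.empty(module.max_seq_len, module.embed_dim)
        sine, cosine = module.get_sinusoid_embeddings(module.max_seq_len, module.embed_dim)
        table[:, 0::2] = sine
        table[:, 1::2] = cosine
        copy_(module.pos_idx_to_embed, table)


CosinePositionalEmbedding().apply(_init_weights)
```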
```diff
@@ -708,6 +721,7 @@ class Florence2Model(Florence2PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, Florence2Seq2SeqModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -936,6 +950,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -947,12 +962,15 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if
-        #
-        #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
 
         return model_inputs
```
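
The second half of the hunk gates `pixel_values` on a new `is_first_iteration` flag rather than on the cache position. A minimal sketch of that flow (a hypothetical standalone function, not the transformers implementation): with a cache, image features are merged into the sequence on the first step and live in the KV cache afterwards, so later steps can skip them; without a cache, every step recomputes from scratch and still needs them.

```python
from typing import Optional

import torch


def prepare_inputs(
    input_ids: torch.Tensor,
    pixel_values: Optional[torch.Tensor],
    is_first_iteration: bool = False,
    use_cache: bool = True,
) -> dict:
    model_inputs = {"input_ids": input_ids, "use_cache": use_cache}
    # First iteration is not necessarily prefill: it can be the first step
    # after a cached system prompt ("continue generation from cache").
    if is_first_iteration or not use_cache:
        model_inputs["pixel_values"] = pixel_values
    return model_inputs


image = torch.zeros(1, 3, 8, 8)
step0 = prepare_inputs(torch.tensor([[1, 2]]), image, is_first_iteration=True)
step1 = prepare_inputs(torch.tensor([[3]]), image)  # later decode step
assert "pixel_values" in step0 and "pixel_values" not in step1
```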
transformers/models/florence2/modular_florence2.py

```diff
@@ -22,6 +22,7 @@ import numpy as np
 import torch.nn as nn
 import torch.nn.functional as F
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache
 from ...configuration_utils import PreTrainedConfig
@@ -1422,7 +1423,7 @@ class Florence2VisionBackbone(Florence2VisionPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
-    def forward(self, hidden_states: torch.Tensor):
+    def forward(self, hidden_states: torch.Tensor, **kwargs):
         for conv, block in zip(self.convs, self.blocks):
             hidden_states = conv(hidden_states)
             for layer in block:
@@ -1500,6 +1501,18 @@ class Florence2PreTrainedModel(LlavaPreTrainedModel):
 
     _supports_attention_backend = False
 
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, Florence2VisionPositionalEmbeddingCosine1D):
+            pos_idx_to_embed = torch.empty((module.max_seq_len, module.embed_dim))
+            sine, cosine = module.get_sinusoid_embeddings(
+                max_positions=module.max_seq_len,
+                embed_dim=module.embed_dim,
+            )
+            pos_idx_to_embed[:, 0::2] = sine
+            pos_idx_to_embed[:, 1::2] = cosine
+            init.copy_(module.pos_idx_to_embed, pos_idx_to_embed)
+
 
 @auto_docstring(
     custom_intro="""
@@ -1551,6 +1564,7 @@ class Florence2Model(LlavaModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, Florence2Seq2SeqModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
```
transformers/models/fnet/modeling_fnet.py

```diff
@@ -23,6 +23,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
+from ... import initialization as init
 from ...utils import auto_docstring, is_scipy_available
 
 
@@ -374,6 +375,12 @@ class FNetPreTrainedModel(PreTrainedModel):
     base_model_prefix = "fnet"
     supports_gradient_checkpointing = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, FNetEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
+
 
 @dataclass
 @auto_docstring(
```
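
The FNet hunk applies the same helpers to index buffers: `position_ids` is refilled with an `arange` and `token_type_ids` is zeroed whenever an `FNetEmbeddings` module is initialized. A minimal sketch with stand-in names, using plain `torch.no_grad` in place of the `init` helpers:

```python
import torch
import torch.nn as nn


class TinyEmbeddings(nn.Module):
    # Hypothetical stand-in for FNetEmbeddings: both index buffers start empty.
    def __init__(self, max_positions: int = 16):
        super().__init__()
        self.register_buffer("position_ids", torch.empty(1, max_positions, dtype=torch.long))
        self.register_buffer("token_type_ids", torch.empty(1, max_positions, dtype=torch.long))


def _init_weights(module: nn.Module) -> None:
    if isinstance(module, TinyEmbeddings):
        with torch.no_grad():  # plays the role of init.copy_ / init.zeros_
            module.position_ids.copy_(torch.arange(module.position_ids.shape[-1]).expand(1, -1))
            module.token_type_ids.zero_()


emb = TinyEmbeddings()
emb.apply(_init_weights)
assert emb.position_ids[0, :3].tolist() == [0, 1, 2]
```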
@@ -439,6 +446,7 @@ class FNetModel(FNetPreTrainedModel):
         inputs_embeds: Optional[torch.FloatTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
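The same one-line change repeats across the FNet, FocalNet, FSMT, and Funnel classes below: every public `forward` gains a trailing `**kwargs`, presumably so shared generation and dispatch utilities can pass extra keyword arguments down the stack without tripping a `TypeError`. A minimal illustration of the difference:

```python
def forward_old(input_ids=None, return_dict=None):
    return input_ids


def forward_new(input_ids=None, return_dict=None, **kwargs):
    return input_ids  # unused extras are simply swallowed


forward_new(input_ids=[1], output_attentions=False)  # fine
# forward_old(input_ids=[1], output_attentions=False)  # would raise TypeError
```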
@@ -540,6 +548,7 @@ class FNetForPreTraining(FNetPreTrainedModel):
         next_sentence_label: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FNetForPreTrainingOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -632,6 +641,7 @@ class FNetForMaskedLM(FNetPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MaskedLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -783,6 +793,7 @@ class FNetForSequenceClassification(FNetPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -856,6 +867,7 @@ class FNetForMultipleChoice(FNetPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MultipleChoiceModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -950,6 +962,7 @@ class FNetForTokenClassification(FNetPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1008,6 +1021,7 @@ class FNetForQuestionAnswering(FNetPreTrainedModel):
         end_positions: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -628,6 +628,7 @@ class FocalNetModel(FocalNetPreTrainedModel):
         bool_masked_pos: Optional[torch.BoolTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FocalNetModelOutput]:
         r"""
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
@@ -710,6 +711,7 @@ class FocalNetForMaskedImageModeling(FocalNetPreTrainedModel):
         bool_masked_pos: Optional[torch.BoolTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FocalNetMaskedImageModelingOutput]:
         r"""
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
@@ -812,6 +814,7 @@ class FocalNetForImageClassification(FocalNetPreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FocalNetImageClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -871,6 +874,7 @@ class FocalNetBackbone(FocalNetPreTrainedModel, BackboneMixin):
         pixel_values: torch.Tensor,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> BackboneOutput:
         r"""
         Examples:
@@ -843,6 +843,7 @@ class FSMTModel(PretrainedFSMTModel):
         decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -980,6 +981,7 @@ class FSMTForConditionalGeneration(PretrainedFSMTModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -766,6 +766,7 @@ class FunnelBaseModel(FunnelPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -832,6 +833,7 @@ class FunnelModel(FunnelPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -923,6 +925,7 @@ class FunnelForPreTraining(FunnelPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FunnelForPreTrainingOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1012,6 +1015,7 @@ class FunnelForMaskedLM(FunnelPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MaskedLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1079,6 +1083,7 @@ class FunnelForSequenceClassification(FunnelPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1158,6 +1163,7 @@ class FunnelForMultipleChoice(FunnelPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MultipleChoiceModelOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1233,6 +1239,7 @@ class FunnelForTokenClassification(FunnelPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1295,6 +1302,7 @@ class FunnelForQuestionAnswering(FunnelPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -14,7 +14,7 @@
 # limitations under the License.
 """Tokenization class for Funnel Transformer."""

-from typing import Optional
+from typing import Optional, Union

 from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
 from tokenizers.models import WordPiece
@@ -83,16 +83,17 @@ class FunnelTokenizer(TokenizersBackend):
         value for `lowercase` (as in the original BERT).
         wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
             The prefix for subwords.
-        vocab (`dict`, *optional*):
+        vocab (`str` or `dict[str, int]`, *optional*):
             Custom vocabulary dictionary.
     """

     vocab_files_names = VOCAB_FILES_NAMES
-
+    model = WordPiece
     cls_token_type_id: int = 2

     def __init__(
         self,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
         do_lower_case: bool = True,
         unk_token: str = "<unk>",
         sep_token: str = "<sep>",
@@ -105,23 +106,18 @@ class FunnelTokenizer(TokenizersBackend):
         tokenize_chinese_chars: bool = True,
         strip_accents: Optional[bool] = None,
         wordpieces_prefix: str = "##",
-        vocab: Optional[dict] = None,
-        vocab_file: Optional[str] = None,
         **kwargs,
     ):
-        self.vocab_file = vocab_file
         self.do_lower_case = do_lower_case
         self.tokenize_chinese_chars = tokenize_chinese_chars
         self.strip_accents = strip_accents
         self.clean_text = clean_text
         self.wordpieces_prefix = wordpieces_prefix

-
-
-
-
-        else:
-            self._vocab = {
+        self._vocab = (
+            vocab
+            if vocab is not None
+            else {
                 str(pad_token): 0,
                 str(unk_token): 1,
                 str(cls_token): 2,
@@ -130,6 +126,7 @@ class FunnelTokenizer(TokenizersBackend):
                 str(bos_token): 5,
                 str(eos_token): 6,
             }
+        )

         self._tokenizer = Tokenizer(WordPiece(self._vocab, unk_token=str(unk_token)))

@@ -142,19 +139,7 @@ class FunnelTokenizer(TokenizersBackend):
         self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
         self._tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single=f"{cls_token}:2 $A:0 {sep_token}:0",  # token_type_id is 2 for Funnel transformer
-            pair=f"{cls_token}:2 $A:0 {sep_token}:0 $B:1 {sep_token}:1",
-            special_tokens=[
-                (str(cls_token), self._vocab.get(str(cls_token), 2)),
-                (str(sep_token), self._vocab.get(str(sep_token), 3)),
-            ],
-        )
-
-        tokenizer_object = self._tokenizer
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             do_lower_case=do_lower_case,
             unk_token=unk_token,
             sep_token=sep_token,
@@ -169,6 +154,14 @@ class FunnelTokenizer(TokenizersBackend):
             wordpieces_prefix=wordpieces_prefix,
             **kwargs,
         )
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{cls_token}:2 $A:0 {sep_token}:0",  # token_type_id is 2 for Funnel transformer
+            pair=f"{cls_token}:2 $A:0 {sep_token}:0 $B:1 {sep_token}:1",
+            special_tokens=[
+                (str(cls_token), self.cls_token_id),
+                (str(sep_token), self.sep_token_id),
+            ],
+        )


 __all__ = ["FunnelTokenizer"]
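The post-processor setup moves after `super().__init__` so it can use `self.cls_token_id` / `self.sep_token_id` instead of hard-coded id fallbacks. For reference, a self-contained sketch of how `tokenizers.processors.TemplateProcessing` assigns Funnel's unusual segment id 2 to the CLS token (toy vocab and token strings, not Funnel's real ones):

```python
from tokenizers import Tokenizer, pre_tokenizers
from tokenizers.models import WordPiece
from tokenizers.processors import TemplateProcessing

vocab = {"<pad>": 0, "<unk>": 1, "<cls>": 2, "<sep>": 3, "hello": 4, "world": 5}
tok = Tokenizer(WordPiece(vocab, unk_token="<unk>"))
tok.pre_tokenizer = pre_tokenizers.Whitespace()
tok.post_processor = TemplateProcessing(
    single="<cls>:2 $A:0 <sep>:0",  # CLS gets token_type_id 2, as in FunnelTokenizer
    pair="<cls>:2 $A:0 <sep>:0 $B:1 <sep>:1",
    special_tokens=[("<cls>", 2), ("<sep>", 3)],
)

enc = tok.encode("hello world")
print(enc.tokens)    # ['<cls>', 'hello', 'world', '<sep>']
print(enc.type_ids)  # [2, 0, 0, 0]
```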
@@ -94,7 +94,7 @@ class FuyuBatchFeature(BatchFeature):
     The outputs dictionary from the processors contains a mix of tensors and lists of tensors.
     """

-    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
+    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None, **kwargs):
        """
        Convert the inner content to tensors.

@@ -359,6 +359,7 @@ class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
         image_patches=None,
         image_patches_indices=None,
         cache_position=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -371,10 +372,11 @@ class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
             image_patches=image_patches,
             image_patches_indices=image_patches_indices,
             cache_position=cache_position,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if
+        if not is_first_iteration and kwargs.get("use_cache", True):
             # set image_patches and image_patches_indices to `None` for decoding stage
             model_inputs["image_patches_indices"] = None
             model_inputs["image_patches"] = None
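The new `is_first_iteration` flag replaces the previous condition (truncated in the old side of the hunk above): image patches are only needed while the multimodal prompt is being prefilled into the KV cache, so later decode steps null them out. A distilled, standalone version of that branch:

```python
def prepare_inputs_for_generation(input_ids, image_patches=None,
                                  image_patches_indices=None,
                                  is_first_iteration=False, **kwargs):
    # Simplified stand-in for the method above, keeping only the image-dropping logic.
    model_inputs = {
        "input_ids": input_ids,
        "image_patches": image_patches,
        "image_patches_indices": image_patches_indices,
    }
    if not is_first_iteration and kwargs.get("use_cache", True):
        # set image_patches and image_patches_indices to None for the decoding stage
        model_inputs["image_patches_indices"] = None
        model_inputs["image_patches"] = None
    return model_inputs


prefill = prepare_inputs_for_generation([1, 2, 3], image_patches="...", is_first_iteration=True)
decode = prepare_inputs_for_generation([4], image_patches="...", is_first_iteration=False)
assert prefill["image_patches"] is not None and decode["image_patches"] is None
```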
@@ -337,16 +337,32 @@ class FuyuProcessor(ProcessorMixin):
     r"""
     Constructs a Fuyu processor which wraps a Fuyu image processor and a Llama tokenizer into a single processor.

-    [`FuyuProcessor`] offers all the functionalities of [`FuyuImageProcessor`] and [`
+    [`FuyuProcessor`] offers all the functionalities of [`FuyuImageProcessor`] and [`TokenizersBackend`]. See the
     [`~FuyuProcessor.__call__`] and [`~FuyuProcessor.decode`] for more information.

     Args:
         image_processor ([`FuyuImageProcessor`]):
             The image processor is a required input.
-        tokenizer ([`
+        tokenizer ([`TokenizersBackend`]):
             The tokenizer is a required input.
     """

+    @classmethod
+    def _load_tokenizer_from_pretrained(
+        cls, sub_processor_type, pretrained_model_name_or_path, subfolder="", **kwargs
+    ):
+        """
+        Override for BC. Fuyu uses TokenizersBackend and requires token_type_ids to be removed from model_input_names
+        because Fuyu uses mm_token_type_ids instead for multimodal token identification.
+        """
+        from ...tokenization_utils_tokenizers import TokenizersBackend
+
+        tokenizer = TokenizersBackend.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        # Remove token_type_ids as Fuyu uses mm_token_type_ids instead
+        if "token_type_ids" in tokenizer.model_input_names:
+            tokenizer.model_input_names.remove("token_type_ids")
+        return tokenizer
+
     def __init__(self, image_processor, tokenizer, **kwargs):
         super().__init__(image_processor=image_processor, tokenizer=tokenizer)
         self.image_processor = image_processor
@@ -486,7 +502,7 @@ class FuyuProcessor(ProcessorMixin):
     ) -> "FuyuBatchFeature":
         """
         Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
-        and `kwargs` arguments to
+        and `kwargs` arguments to TokenizersBackend's [`~TokenizersBackend.__call__`] if `text` is not `None` to
         encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
         FuyuImageProcessor's [`~FuyuImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
         of the above two methods for more information.
@@ -29,7 +29,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_func_from_hub
+from ...integrations import use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import (
     GenericForSequenceClassification,
@@ -41,7 +41,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_gemma import GemmaConfig


@@ -98,7 +98,7 @@ class GemmaRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
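`original_inv_freq` is now registered as a buffer rather than assigned as a plain attribute (the old assignment is truncated above), so it follows the module through `.to()` and dtype casts; `persistent=False` keeps it out of checkpoints. A small demonstration of that flag:

```python
import torch
from torch import nn


class Rope(nn.Module):
    def __init__(self):
        super().__init__()
        inv_freq = 1.0 / (10000.0 ** (torch.arange(0, 8, 2).float() / 8))
        # Buffers move with the module (device/dtype) but, with persistent=False,
        # are not written to the state_dict, so older checkpoints stay loadable.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)


print(list(Rope().state_dict().keys()))  # [] -- neither buffer is serialized
```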
@@ -137,7 +137,7 @@ class GemmaRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
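`maybe_autocast` replaces the previous raw context manager (truncated on the old side above); presumably it wraps `torch.autocast` and degrades to a no-op on backends without autocast support, though that is an assumption. The plain-PyTorch equivalent of keeping the frequency matmul in float32:

```python
import torch

inv_freq = 1.0 / (10000.0 ** (torch.arange(0, 64, 2).float() / 64))  # (32,)
position_ids = torch.arange(8)[None, :]                               # (1, 8)

inv_freq_expanded = inv_freq[None, :, None].float().expand(1, -1, 1)  # (1, 32, 1)
position_ids_expanded = position_ids[:, None, :].float()              # (1, 1, 8)

with torch.autocast(device_type="cpu", enabled=False):  # force float32 even under autocast
    freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)  # (1, 8, 32)
    emb = torch.cat((freqs, freqs), dim=-1)                              # (1, 8, 64)
    cos, sin = emb.cos(), emb.sin()
```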
@@ -219,6 +219,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class GemmaAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -244,7 +245,6 @@ class GemmaAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb

     def forward(
         self,
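Taken together, these two hunks swap a per-instance `self.rotary_fn = apply_rotary_pos_emb` assignment for the class-level `@use_kernelized_func(apply_rotary_pos_emb)` decorator. Its real implementation lives in `transformers.integrations` (see `hub_kernels.py` above); a generic, entirely hypothetical sketch of the pattern the change suggests:

```python
def use_kernelized_func(default_fn):
    # Hypothetical body, not the library's: attach the eager default at class level
    # so a kernel registry can later substitute an optimized implementation once,
    # for all instances, instead of each __init__ pinning the eager function.
    def decorate(cls):
        cls.rotary_fn = staticmethod(default_fn)
        return cls

    return decorate
```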
@@ -410,16 +410,14 @@ class GemmaModel(GemmaPreTrainedModel):
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)

-
-
-
-
-
-
-
-
-            position_ids=position_ids,
-        )
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )

         # embed positions
         hidden_states = inputs_embeds
@@ -434,7 +432,7 @@ class GemmaModel(GemmaPreTrainedModel):
         for decoder_layer in self.layers[: self.config.num_hidden_layers]:
             hidden_states = decoder_layer(
                 hidden_states,
-                attention_mask=
+                attention_mask=causal_mask,
                 position_ids=position_ids,
                 past_key_values=past_key_values,
                 use_cache=use_cache,
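Both the standalone and modular `GemmaModel` now build the mask once via `create_causal_mask` and hand the result to every decoder layer (the deleted call's opening lines were lost in extraction above). For intuition, a toy illustration of what a causal mask encodes during cached decoding; this is not the library's implementation:

```python
import torch


def toy_causal_mask(cache_position: torch.Tensor, kv_len: int) -> torch.Tensor:
    # Each query (at absolute position cache_position[i]) may attend to every
    # key at or before it; with a KV cache, queries can start past position 0.
    kv_positions = torch.arange(kv_len)
    return kv_positions[None, :] <= cache_position[:, None]  # (q_len, kv_len)


print(toy_causal_mask(torch.arange(4), kv_len=4).int())
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 1]])
print(toy_causal_mask(torch.tensor([5]), kv_len=6).int())  # one decode step sees all 6 keys
```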
@@ -267,16 +267,14 @@ class GemmaModel(LlamaModel):
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)

-
-
-
-
-
-
-
-
-            position_ids=position_ids,
-        )
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )

         # embed positions
         hidden_states = inputs_embeds
@@ -291,7 +289,7 @@ class GemmaModel(LlamaModel):
         for decoder_layer in self.layers[: self.config.num_hidden_layers]:
             hidden_states = decoder_layer(
                 hidden_states,
-                attention_mask=
+                attention_mask=causal_mask,
                 position_ids=position_ids,
                 past_key_values=past_key_values,
                 use_cache=use_cache,