transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
|
2
|
+
# This file was automatically generated from src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py.
|
|
3
|
+
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
|
4
|
+
# the file from the modular. If any change should be done, please apply the change to the
|
|
5
|
+
# modular_paddleocr_vl.py file directly. One of our CI enforces this.
|
|
6
|
+
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
|
7
|
+
# Copyright 2025 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved.
|
|
8
|
+
#
|
|
9
|
+
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
|
10
|
+
# and OPT implementations in this library. It has been modified from its
|
|
11
|
+
# original forms to accommodate minor architectural differences compared
|
|
12
|
+
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
|
13
|
+
#
|
|
14
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
15
|
+
# you may not use this file except in compliance with the License.
|
|
16
|
+
# You may obtain a copy of the License at
|
|
17
|
+
#
|
|
18
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
19
|
+
#
|
|
20
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
21
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
22
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
23
|
+
# See the License for the specific language governing permissions and
|
|
24
|
+
# limitations under the License.
|
|
25
|
+
|
|
26
|
+
from typing import Union
|
|
27
|
+
|
|
28
|
+
from ...image_processing_utils import BatchFeature
|
|
29
|
+
from ...image_utils import ImageInput
|
|
30
|
+
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
|
31
|
+
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class PaddleOCRVLProcessorKwargs(ProcessingKwargs, total=False):
|
|
35
|
+
_defaults = {
|
|
36
|
+
"text_kwargs": {
|
|
37
|
+
"padding": False,
|
|
38
|
+
},
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class PaddleOCRVLProcessor(ProcessorMixin):
|
|
43
|
+
r"""
|
|
44
|
+
[`PaddleOCRVLProcessor`] offers all the functionalities of [`PaddleOCRVLImageProcessor`] and [`LLamaTokenizerFast`]. See the
|
|
45
|
+
[`~PaddleOCRVLProcessor.__call__`] and [`~PaddleOCRVLProcessor.decode`] for more information.
|
|
46
|
+
Args:
|
|
47
|
+
image_processor ([`PaddleOCRVLImageProcessor`], *optional*):
|
|
48
|
+
The image processor is a required input.
|
|
49
|
+
tokenizer ([`LLamaTokenizerFast`], *optional*):
|
|
50
|
+
The tokenizer is a required input.
|
|
51
|
+
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
|
|
52
|
+
in a chat into a tokenizable string.
|
|
53
|
+
"""
|
|
54
|
+
|
|
55
|
+
image_processor_class = "AutoImageProcessor"
|
|
56
|
+
tokenizer_class = "AutoTokenizer"
|
|
57
|
+
|
|
58
|
+
def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
|
|
59
|
+
self.image_token = tokenizer.image_token
|
|
60
|
+
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        **kwargs: Unpack[PaddleOCRVLProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
+        """
+        output_kwargs = self._merge_kwargs(
+            PaddleOCRVLProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        if images is not None:
+            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
+            image_grid_thw = image_inputs["image_grid_thw"]
+
+        else:
+            image_inputs = {}
+            image_grid_thw = None
+
+        if not isinstance(text, list):
+            text = [text]
+
+        text = text.copy()
+
+        if image_grid_thw is not None:
+            index = 0
+            for i in range(len(text)):
+                while self.image_token in text[i]:
+                    text[i] = text[i].replace(
+                        self.image_token,
+                        "<|placeholder|>"
+                        * (
+                            image_grid_thw[index].prod()
+                            // self.image_processor.merge_size
+                            // self.image_processor.merge_size
+                        ),
+                        1,
+                    )
+                    index += 1
+                text[i] = text[i].replace("<|placeholder|>", self.image_token)
+
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+
+        return BatchFeature(data={**text_inputs, **image_inputs})
+
+
+__all__ = ["PaddleOCRVLProcessor"]
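A standalone sketch of the placeholder expansion added above, with all concrete values illustrative rather than taken from the package: each image contributes `grid.prod() // merge_size**2` tokens (one per merged patch), and the intermediate `<|placeholder|>` string avoids re-matching the image token while expanding.

    image_token = "<IMG>"              # assumed token string
    merge_size = 2                     # assumed spatial merge factor
    image_grid_thw = [(1, 16, 16)]     # (temporal, height, width) patch grid per image

    text = [f"Describe this image: {image_token}"]
    index = 0
    for i in range(len(text)):
        while image_token in text[i]:
            t, h, w = image_grid_thw[index]
            num_tokens = (t * h * w) // (merge_size * merge_size)  # 256 // 4 = 64
            text[i] = text[i].replace(image_token, "<|placeholder|>" * num_tokens, 1)
            index += 1
        text[i] = text[i].replace("<|placeholder|>", image_token)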
@@ -149,7 +149,8 @@ def create_causal_mask_mapping(
     position_ids: Optional[torch.Tensor],
     token_type_ids: Optional[torch.Tensor] = None,
     pixel_values: Optional[torch.FloatTensor] = None,
-    is_training: bool = False,
+    is_training: Optional[bool] = False,
+    is_first_iteration: Optional[bool] = None,
     **kwargs,
 ) -> dict:
     """
@@ -169,31 +170,33 @@ def create_causal_mask_mapping(
         "past_key_values": past_key_values,
         "position_ids": position_ids,
     }
-    #
-    #
-    #
-
-
-
+    # Infer if prefill or decoding stage, if the flag isn't passed. This happens only when the mask is constructed
+    # from `forward` call. If users run a `forward` call, we have no option to infer `is_first_iteration` because users may be
+    # running generation with custom loop. Thus we need to infer it in a `non-perfect` way
+    # NOTE: Determining prefill in that case requires checking data values, which is not compile-compatible.
+    is_first_iteration = (
+        is_first_iteration
+        if is_first_iteration
+        else (past_key_values is None or not past_key_values.is_initialized or pixel_values is not None)
+    )
+
+    if is_first_iteration or not kwargs.get("use_cache", True):
         if token_type_ids is not None:
             # The logic below was originally written for Gemma3, where `token_type_ids` is reversed. Let's reverse
             # it to then use exactly the same logic.
             token_type_ids = 1 - token_type_ids
         else:
             logger.warning_once(
-                "
+                "It is a prefill stage but The `token_type_ids` is not provided. We recommend "
                 "passing `token_type_ids` to the model to prevent bad attention masking."
             )
-            # BC: when NOT training, use bidirectional mask if sequence length > 1. Otherwise, use the default causal
-            # mask. This is incorrect in some advanced use cases, hence the warning above.
             # NOTE: this branch can't be reached when training because `token_type_ids` is required as a model input.
-
-            token_type_ids = torch.ones_like(input_embeds)[:, :, 0]
+            token_type_ids = torch.ones_like(input_embeds)[:, :, 0]

     # Logic originally copied from Gemma3. It holds up for Paligemma as well because Paligemma assumes up to one image
     # per prompt AND we reverse `token_type_ids` above. Gemma3 uses a bidirectional mask for images, tagged through
     # `token_type_ids` 1s.
-    if token_type_ids is not None and
+    if token_type_ids is not None and is_first_iteration:
         # We need to pass an additional mask function to account for token type ids, and it needs to be an `or` (to
         # undo the causal masking)

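Read in isolation, the fallback inference added above amounts to a three-way predicate. A minimal sketch, assuming only that the cache object exposes an `is_initialized` attribute as in the hunk:

    # Minimal sketch of the fallback inference: any of these three
    # conditions marks the request as a first (prefill-like) iteration.
    def infer_is_first_iteration(past_key_values, pixel_values):
        return (
            past_key_values is None                 # no cache object at all
            or not past_key_values.is_initialized   # cache exists but is empty
            or pixel_values is not None             # fresh image features to merge
        )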
@@ -550,6 +553,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixin):
         use_cache=True,
         logits_to_keep=None,
         labels=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- custom `position_ids` and `pixel_values` handling
@@ -563,6 +567,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixin):
             use_cache=use_cache,
             logits_to_keep=logits_to_keep,
             token_type_ids=token_type_ids,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

@@ -570,9 +575,11 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixin):
         if model_inputs.get("position_ids") is not None:
             model_inputs["position_ids"] += 1

-        #
-        #
-
+        # Pixel values are used only in the first iteration if available
+        # In subsequent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache). NOTE: use_cache=False needs pixel_values always
+        if is_first_iteration or not use_cache:
             model_inputs["pixel_values"] = pixel_values

         return model_inputs
@@ -586,6 +593,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixin):
         past_key_values: Optional[Cache],
         position_ids: Optional[torch.Tensor],
         token_type_ids: Optional[torch.Tensor] = None,
+        is_first_iteration: Optional[bool] = False,
         **kwargs,
     ) -> dict:
         # Uses the overwritten `create_masks_for_generate` with `token_type_ids` masking
@@ -597,7 +605,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixin):
             past_key_values,
             position_ids,
             token_type_ids,
-
+            is_first_iteration=is_first_iteration,
             **{k: v for k, v in kwargs.items() if k != "pixel_values"},
         )

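The pixel gating above can be read as a small pure function; a sketch with hypothetical names: images are forwarded only on the first iteration (or whenever caching is off), since afterwards their features already live in the KV cache and re-sending them would be redundant work.

    def gate_pixel_values(model_inputs, pixel_values, is_first_iteration, use_cache):
        if is_first_iteration or not use_cache:
            model_inputs["pixel_values"] = pixel_values
        return model_inputs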
@@ -121,9 +121,6 @@ class ParakeetEncoderConfig(PreTrainedConfig):
         initializer_range=0.02,
         **kwargs,
     ):
-        super().__init__(
-            **kwargs,
-        )
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
@@ -133,10 +130,7 @@ class ParakeetEncoderConfig(PreTrainedConfig):
         self.attention_bias = attention_bias
         self.convolution_bias = convolution_bias

-        if (conv_kernel_size - 1) % 2 != 0:
-            raise ValueError(f"conv_kernel_size must be odd, got {conv_kernel_size}")
         self.conv_kernel_size = conv_kernel_size
-
         self.subsampling_conv_kernel_size = subsampling_conv_kernel_size
         self.subsampling_conv_stride = subsampling_conv_stride

@@ -153,6 +147,10 @@ class ParakeetEncoderConfig(PreTrainedConfig):
         self.scale_input = scale_input
         self.initializer_range = initializer_range

+        super().__init__(
+            **kwargs,
+        )
+

 class ParakeetCTCConfig(PreTrainedConfig):
     r"""
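The `super().__init__(**kwargs)` call moves from the top of the initializer to the bottom. A toy sketch of the pattern, with illustrative names rather than the real classes: the base config's initializer consumes generic `**kwargs`, so model-specific attributes are now assigned first and the base initializer runs last, once every field it might touch already exists.

    class ToyBaseConfig:
        def __init__(self, **kwargs):
            self.use_cache = kwargs.pop("use_cache", True)  # generic option

    class ToyEncoderConfig(ToyBaseConfig):
        def __init__(self, hidden_size=256, **kwargs):
            self.hidden_size = hidden_size  # model-specific attributes first
            super().__init__(**kwargs)      # generic handling last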
@@ -29,13 +29,13 @@ from torch import nn

 from ... import initialization as init
 from ...activations import ACT2FN
-from ...integrations import use_kernel_func_from_hub
+from ...integrations import use_kernel_func_from_hub, use_kernelized_func
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, CausalLMOutput
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_parakeet import ParakeetCTCConfig, ParakeetEncoderConfig


@@ -88,7 +88,7 @@ class ParakeetEncoderRelPositionalEncoding(nn.Module):
             if isinstance(hidden_states.device.type, str) and hidden_states.device.type != "mps"
             else "cpu"
         )
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             sin = freqs.sin()
             cos = freqs.cos()
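A sketch of why the positional-encoding math is wrapped in a disabled-autocast region: under bf16/fp16 autocast the matmul would run in reduced precision and degrade the sin/cos frequencies. `maybe_autocast` is assumed here to behave like `torch.autocast` while tolerating devices without autocast support; the standalone version below uses plain `torch.autocast` with illustrative shapes.

    import torch

    inv_freq = torch.rand(1, 64, 1)
    position_ids = torch.arange(128).view(1, 1, -1).float()
    with torch.autocast(device_type="cpu", enabled=False):  # force full precision
        freqs = (inv_freq.float() @ position_ids.float()).transpose(1, 2)
        sin, cos = freqs.sin(), freqs.cos()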
@@ -155,7 +155,7 @@ class ParakeetEncoderConvolutionModule(nn.Module):

         Args:
             hidden_states (`torch.Tensor` of shape `(batch, time, channels)`): Input tensor.
-            attention_mask (`torch.Tensor` of shape `(batch, 1, time)`): Attention mask.
+            attention_mask (`torch.Tensor` of shape `(batch, 1, time, time)`): Attention mask.

         Returns:
             `torch.Tensor`: Output tensor of shape `(batch, time, channels)`.
@@ -171,7 +171,10 @@ class ParakeetEncoderConvolutionModule(nn.Module):

         # Apply padding mask before convolution
         if attention_mask is not None:
-
+            if attention_mask.dtype == torch.bool:
+                all_masked_rows = torch.all(~attention_mask, dim=2)
+            else:
+                all_masked_rows = torch.all(~(attention_mask == 0.0), dim=2)
             hidden_states = hidden_states.masked_fill(all_masked_rows, 0.0)

         # 1D Depthwise Conv
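A sketch of the dtype branch added above: boolean masks mark valid positions with `True`, while additive float masks mark valid positions with `0.0` (and padding with a large negative value), so fully padded rows are detected differently per convention. Shapes here are illustrative.

    import torch

    bool_mask = torch.tensor([[[True, True, False, False]]])  # (batch, 1, time)
    float_mask = torch.zeros_like(bool_mask, dtype=torch.float32)
    float_mask[~bool_mask] = torch.finfo(torch.float32).min

    all_masked_bool = torch.all(~bool_mask, dim=2)
    all_masked_float = torch.all(~(float_mask == 0.0), dim=2)
    assert torch.equal(all_masked_bool, all_masked_float)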
@@ -256,6 +259,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class ParakeetEncoderAttention(nn.Module):
     """Multi-head attention with relative positional encoding. See section 3.3 of https://huggingface.co/papers/1901.02860."""

@@ -281,7 +285,6 @@ class ParakeetEncoderAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         # W_{k,R} projection
         self.relative_k_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
         # global content bias
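The decorator swap above removes the per-instance `self.rotary_fn` assignment. `use_kernelized_func` is not documented in this diff, so the following is only a generic sketch of the class-decorator shape it appears to have; the attribute name and semantics are assumptions, not the library's actual implementation.

    # Hypothetical sketch only: a class decorator that attaches a (potentially
    # hub-accelerated) function at class level instead of assigning it per
    # instance in __init__. The real use_kernelized_func may differ.
    def use_kernelized_func_sketch(fn):
        def decorator(cls):
            cls.rotary_fn = staticmethod(fn)  # assumed attribute name
            return cls
        return decorator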
@@ -507,6 +510,11 @@ class ParakeetPreTrainedModel(PreTrainedModel):
             # Initialize positional bias parameters
             init.normal_(module.bias_u, mean=0.0, std=std)
             init.normal_(module.bias_v, mean=0.0, std=std)
+        elif isinstance(module, ParakeetEncoderRelPositionalEncoding):
+            inv_freq = 1.0 / (
+                10000.0 ** (torch.arange(0, self.config.hidden_size, 2, dtype=torch.int64) / self.config.hidden_size)
+            )
+            init.copy_(module.inv_freq, inv_freq)

     def _get_subsampling_output_length(self, input_lengths: torch.Tensor):
         encoder_config = self.config.encoder_config if isinstance(self.config, ParakeetCTCConfig) else self.config
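The buffer re-initialization above recomputes the standard transformer inverse-frequency table, `inv_freq[i] = 1 / 10000^(2i/d)`. A self-contained version for an assumed hidden size of 512:

    import torch

    hidden_size = 512
    inv_freq = 1.0 / (
        10000.0 ** (torch.arange(0, hidden_size, 2, dtype=torch.int64) / hidden_size)
    )
    assert inv_freq.shape == (hidden_size // 2,)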
@@ -29,7 +29,7 @@ from ...modeling_outputs import BaseModelOutput, CausalLMOutput
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from ..fastspeech2_conformer.modeling_fastspeech2_conformer import FastSpeech2ConformerConvolutionModule
 from ..llama.modeling_llama import LlamaAttention, eager_attention_forward
 from .configuration_parakeet import ParakeetCTCConfig, ParakeetEncoderConfig
@@ -84,7 +84,7 @@ class ParakeetEncoderRelPositionalEncoding(nn.Module):
             if isinstance(hidden_states.device.type, str) and hidden_states.device.type != "mps"
             else "cpu"
         )
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             sin = freqs.sin()
             cos = freqs.cos()
@@ -346,6 +346,11 @@ class ParakeetPreTrainedModel(PreTrainedModel):
             # Initialize positional bias parameters
             init.normal_(module.bias_u, mean=0.0, std=std)
             init.normal_(module.bias_v, mean=0.0, std=std)
+        elif isinstance(module, ParakeetEncoderRelPositionalEncoding):
+            inv_freq = 1.0 / (
+                10000.0 ** (torch.arange(0, self.config.hidden_size, 2, dtype=torch.int64) / self.config.hidden_size)
+            )
+            init.copy_(module.inv_freq, inv_freq)

     def _get_subsampling_output_length(self, input_lengths: torch.Tensor):
         encoder_config = self.config.encoder_config if isinstance(self.config, ParakeetCTCConfig) else self.config
@@ -16,10 +16,10 @@
 import itertools
 from typing import Optional, Union

-from ...tokenization_utils_tokenizers import PreTrainedTokenizerFast
+from ...tokenization_utils_tokenizers import TokenizersBackend


-class ParakeetTokenizerFast(PreTrainedTokenizerFast):
+class ParakeetTokenizer(TokenizersBackend):
     """
     Inherits all methods from [`PreTrainedTokenizerFast`]. Users should refer to this superclass for more information regarding those methods,
     except for `_decode` which is overridden to adapt it to CTC decoding:
@@ -51,4 +51,4 @@ class ParakeetTokenizerFast(PreTrainedTokenizerFast):
     )


-__all__ = ["ParakeetTokenizerFast"]
+__all__ = ["ParakeetTokenizer"]
@@ -696,6 +696,10 @@ class PatchTSMixerPreTrainedModel(PreTrainedModel):
         elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, PatchTSMixerBatchNorm):
             init.zeros_(module.batchnorm.bias)
             init.ones_(module.batchnorm.weight)
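A standalone sketch of the running-statistics reset added above: BatchNorm layers created with `track_running_stats=True` carry `running_mean`/`running_var`/`num_batches_tracked` buffers next to their affine weight and bias, and the `getattr` guard skips layers (such as LayerNorm) that have no such buffers.

    import torch
    from torch import nn

    bn = nn.BatchNorm1d(8)
    if getattr(bn, "running_mean", None) is not None:
        with torch.no_grad():
            bn.running_mean.zero_()
            bn.running_var.fill_(1.0)
            bn.num_batches_tracked.zero_()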
@@ -1141,6 +1145,7 @@ class PatchTSMixerEncoder(PatchTSMixerPreTrainedModel):
         past_values: torch.Tensor,
         output_hidden_states: Optional[bool] = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, PatchTSMixerEncoderOutput]:
         r"""
         past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
@@ -1251,6 +1256,7 @@ class PatchTSMixerModel(PatchTSMixerPreTrainedModel):
         observed_mask: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> PatchTSMixerModelOutput:
         r"""
         past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
@@ -1362,6 +1368,7 @@ class PatchTSMixerForPretraining(PatchTSMixerPreTrainedModel):
         output_hidden_states: Optional[bool] = False,
         return_loss: bool = True,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> PatchTSMixerForPreTrainingOutput:
         r"""
         past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
@@ -1574,6 +1581,7 @@ class PatchTSMixerForPrediction(PatchTSMixerPreTrainedModel):
         output_hidden_states: Optional[bool] = False,
         return_loss: bool = True,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> PatchTSMixerForPredictionOutput:
         r"""
         past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
@@ -1797,6 +1805,7 @@ class PatchTSMixerForTimeSeriesClassification(PatchTSMixerPreTrainedModel):
         output_hidden_states: Optional[bool] = False,
         return_loss: bool = True,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> PatchTSMixerForTimeSeriesClassificationOutput:
         r"""
         past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
@@ -1987,6 +1996,7 @@ class PatchTSMixerForRegression(PatchTSMixerPreTrainedModel):
         output_hidden_states: Optional[bool] = False,
         return_loss: bool = True,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> PatchTSMixerForRegressionOutput:
         r"""
         past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
@@ -24,6 +24,7 @@ from torch import nn

 from ... import initialization as init
 from ...activations import ACT2CLS
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import BaseModelOutput
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
@@ -418,7 +419,7 @@ class PatchTSTEncoderLayer(nn.Module):
         super().__init__()

         self.channel_attention = config.channel_attention
-
+
         self.self_attn = PatchTSTAttention(
             embed_dim=config.d_model,
             num_heads=config.num_attention_heads,
@@ -555,6 +556,9 @@ class PatchTSTPreTrainedModel(PreTrainedModel):
     main_input_name = "past_values"
     input_modalities = ("time",)
     supports_gradient_checkpointing = False
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True

     @torch.no_grad()
     def _init_weights(self, module: nn.Module):
@@ -571,13 +575,22 @@ class PatchTSTPreTrainedModel(PreTrainedModel):
             init.normal_(module.cls_token, std=0.02)
             num_patches += 1
             # initialize positional encoding
-
-
+            position_enc = module._init_pe(self.config, num_patches)
+            if is_deepspeed_zero3_enabled():
+                import deepspeed
+
+                with deepspeed.zero.GatheredParameters(module.position_enc, modifier_rank=None):
+                    if module.position_enc.numel() > 0:
+                        init.copy_(module.position_enc, position_enc)
+            else:
+                init.copy_(module.position_enc, position_enc)
+        elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
-
-
-
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, nn.Linear):
             init.normal_(module.weight, mean=0.0, std=self.config.init_std)
             if module.bias is not None:
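A sketch of the ZeRO-3 branch added above: under DeepSpeed ZeRO-3 a parameter can be partitioned across ranks (locally zero-sized), so it must be gathered before its values can be written. `module.position_enc` and `position_enc` stand in for the real objects, and deepspeed is only imported on the ZeRO-3 path.

    def copy_position_enc(module, position_enc, zero3_enabled):
        if zero3_enabled:
            import deepspeed

            with deepspeed.zero.GatheredParameters(module.position_enc, modifier_rank=None):
                if module.position_enc.numel() > 0:  # this rank sees the gathered tensor
                    module.position_enc.data.copy_(position_enc)
        else:
            module.position_enc.data.copy_(position_enc)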
@@ -704,6 +717,7 @@ class PatchTSTEncoder(PatchTSTPreTrainedModel):
         patch_input: torch.Tensor,
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
+        **kwargs,
     ) -> BaseModelOutput:
         """
         Parameters:
@@ -1092,6 +1106,7 @@ class PatchTSTModel(PatchTSTPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, PatchTSTModelOutput]:
         r"""
         Parameters:
@@ -1228,6 +1243,7 @@ class PatchTSTForPretraining(PatchTSTPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, PatchTSTForPretrainingOutput]:
         r"""
         Parameters:
@@ -1387,6 +1403,7 @@ class PatchTSTForClassification(PatchTSTPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, PatchTSTForClassificationOutput]:
         r"""
         past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
@@ -1594,6 +1611,7 @@ class PatchTSTForPrediction(PatchTSTPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, PatchTSTForPredictionOutput]:
         r"""
         Parameters:
@@ -1840,6 +1858,7 @@ class PatchTSTForRegression(PatchTSTPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, PatchTSTForRegressionOutput]:
         r"""
         past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
@@ -0,0 +1,30 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_pe_audio import *
+    from .feature_extraction_pe_audio import *
+    from .modeling_pe_audio import *
+    from .processing_pe_audio import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
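This new package init follows the library's lazy-import layout: the eager star-imports under `TYPE_CHECKING` keep IDEs and type checkers happy, while at runtime the module object is replaced by a `_LazyModule` that resolves attributes on first access. A minimal standalone analogue of that mechanism (not the `_LazyModule` implementation itself; names are illustrative):

    import importlib
    import types

    class LazySubmodules(types.ModuleType):
        def __init__(self, name, attr_to_module):
            super().__init__(name)
            # e.g. {"PeAudioConfig": "pkg.configuration_pe_audio"} (hypothetical)
            self._attr_to_module = attr_to_module

        def __getattr__(self, attr):
            if attr in self._attr_to_module:
                module = importlib.import_module(self._attr_to_module[attr])
                value = getattr(module, attr)
                setattr(self, attr, value)  # cache for subsequent lookups
                return value
            raise AttributeError(f"module {self.__name__!r} has no attribute {attr!r}")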