transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
|
2
|
+
# This file was automatically generated from src/transformers/models/lasr/modular_lasr.py.
|
|
3
|
+
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
|
4
|
+
# the file from the modular. If any change should be done, please apply the change to the
|
|
5
|
+
# modular_lasr.py file directly. One of our CI enforces this.
|
|
6
|
+
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
|
7
|
+
# coding=utf-8
|
|
8
|
+
# Copyright 2025 The HuggingFace Inc. team and Google LLC. All rights reserved.
|
|
9
|
+
#
|
|
10
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
11
|
+
# you may not use this file except in compliance with the License.
|
|
12
|
+
# You may obtain a copy of the License at
|
|
13
|
+
#
|
|
14
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
15
|
+
#
|
|
16
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
17
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
18
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
19
|
+
# See the License for the specific language governing permissions and
|
|
20
|
+
# limitations under the License.
|
|
21
|
+
|
|
22
|
+
from typing import Union
|
|
23
|
+
|
|
24
|
+
from ...configuration_utils import PreTrainedConfig
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class LasrEncoderConfig(PreTrainedConfig):
|
|
28
|
+
r"""
|
|
29
|
+
This is the configuration class to store the configuration of a [`LasrEncoder`]. It is used to instantiate a
|
|
30
|
+
`LasrEncoder` model according to the specified arguments, defining the model architecture.
|
|
31
|
+
|
|
32
|
+
Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
|
|
33
|
+
documentation from [`PreTrainedConfig`] for more information.
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
hidden_size (`int`, *optional*, defaults to 512):
|
|
37
|
+
Dimension of the layers and the hidden states.
|
|
38
|
+
num_hidden_layers (`int`, *optional*, defaults to 17):
|
|
39
|
+
Number of hidden layers in the Transformer encoder.
|
|
40
|
+
num_attention_heads (`int`, *optional*, defaults to 8):
|
|
41
|
+
Number of attention heads for each attention layer in the Transformer encoder.
|
|
42
|
+
intermediate_size (`int`, *optional*, defaults to 2048):
|
|
43
|
+
Dimension of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
|
|
44
|
+
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
|
45
|
+
The non-linear activation function (function or string) in the encoder and pooler.
|
|
46
|
+
attention_bias (`bool`, *optional*, defaults to `False`):
|
|
47
|
+
Whether to use bias in the attention layers.
|
|
48
|
+
convolution_bias (`bool`, *optional*, defaults to `False`):
|
|
49
|
+
Whether to use bias in convolutions of the conformer's convolution module.
|
|
50
|
+
conv_kernel_size (`int`, *optional*, defaults to 32):
|
|
51
|
+
The kernel size of the convolution layers in the Conformer block.
|
|
52
|
+
subsampling_conv_channels (`int`, *optional*, defaults to 256):
|
|
53
|
+
The number of channels in the subsampling convolution layers.
|
|
54
|
+
subsampling_conv_kernel_size (`int`, *optional*, defaults to 5):
|
|
55
|
+
The kernel size of the subsampling convolution layers.
|
|
56
|
+
subsampling_conv_stride (`int`, *optional*, defaults to 2):
|
|
57
|
+
The stride of the subsampling convolution layers.
|
|
58
|
+
num_mel_bins (`int`, *optional*, defaults to 128):
|
|
59
|
+
Number of mel features.
|
|
60
|
+
dropout (`float`, *optional*, defaults to 0.1):
|
|
61
|
+
The dropout ratio for all fully connected layers in the embeddings, encoder, and pooler.
|
|
62
|
+
dropout_positions (`float`, *optional*, defaults to 0.0):
|
|
63
|
+
The dropout ratio for the positions in the input sequence.
|
|
64
|
+
layerdrop (`float`, *optional*, defaults to 0.1):
|
|
65
|
+
The dropout ratio for the layers in the encoder.
|
|
66
|
+
activation_dropout (`float`, *optional*, defaults to 0.1):
|
|
67
|
+
The dropout ratio for activations inside the fully connected layer.
|
|
68
|
+
attention_dropout (`float`, *optional*, defaults to 0.1):
|
|
69
|
+
The dropout ratio for the attention layers.
|
|
70
|
+
max_position_embeddings (`int`, *optional*, defaults to 10000):
|
|
71
|
+
The maximum sequence length that this model might ever be used with.
|
|
72
|
+
initializer_range (`float`, *optional*, defaults to 0.02):
|
|
73
|
+
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
|
74
|
+
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
|
|
75
|
+
The epsilon used by the layer normalization layers.
|
|
76
|
+
feed_forward_residual_weights (`tuple[float, float]`, *optional*, defaults to `[1.5, 0.5]`):
|
|
77
|
+
The residual weights for the feed forward layers.
|
|
78
|
+
conv_residual_weights (`tuple[float, float]`, *optional*, defaults to `[2.0, 1.0]`):
|
|
79
|
+
The residual weights for the convolution layers.
|
|
80
|
+
batch_norm_momentum (`float`, *optional*, defaults to 0.01):
|
|
81
|
+
The momentum for the batch normalization layers.
|
|
82
|
+
rope_parameters (`RopeParameters`, *optional*):
|
|
83
|
+
Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
|
|
84
|
+
a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
|
|
85
|
+
with longer `max_position_embeddings`.
|
|
86
|
+
|
|
87
|
+
Example:
|
|
88
|
+
```python
|
|
89
|
+
>>> from transformers import LasrEncoderModel, LasrEncoderConfig
|
|
90
|
+
|
|
91
|
+
>>> # Initializing a `LasrEncoder` configuration
|
|
92
|
+
>>> configuration = LasrEncoderConfig()
|
|
93
|
+
|
|
94
|
+
>>> # Initializing a model from the configuration
|
|
95
|
+
>>> model = LasrEncoderModel(configuration)
|
|
96
|
+
|
|
97
|
+
>>> # Accessing the model configuration
|
|
98
|
+
>>> configuration = model.config
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
This configuration class is based on the LasrEncoder architecture from Google Health AI. You can find more details
|
|
102
|
+
and pre-trained models at [TODO/TODO](https://huggingface.co/TODO/TODO).
|
|
103
|
+
"""
|
|
104
|
+
|
|
+    model_type = "lasr_encoder"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        hidden_size=512,
+        num_hidden_layers=17,
+        num_attention_heads=8,
+        intermediate_size=2048,
+        hidden_act="silu",
+        attention_bias=False,
+        convolution_bias=False,
+        conv_kernel_size=32,
+        subsampling_conv_channels=256,
+        subsampling_conv_kernel_size=5,
+        subsampling_conv_stride=2,
+        num_mel_bins=128,
+        dropout=0.1,
+        dropout_positions=0.0,
+        layerdrop=0.1,
+        activation_dropout=0.1,
+        attention_dropout=0.1,
+        max_position_embeddings=10000,
+        initializer_range=0.02,
+        layer_norm_eps=1e-6,
+        feed_forward_residual_weights=[1.5, 0.5],
+        conv_residual_weights=[2.0, 1.0],
+        batch_norm_momentum=0.01,
+        rope_parameters=None,
+        **kwargs,
+    ):
+        self.rope_parameters = rope_parameters
+        self.layer_norm_eps = layer_norm_eps
+        self.feed_forward_residual_weights = feed_forward_residual_weights
+        self.conv_residual_weights = conv_residual_weights
+        self.batch_norm_momentum = batch_norm_momentum
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_attention_heads  # LlamaAttention compatibility
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.attention_bias = attention_bias
+        self.convolution_bias = convolution_bias
+
+        self.conv_kernel_size = conv_kernel_size
+        self.subsampling_conv_kernel_size = subsampling_conv_kernel_size
+        self.subsampling_conv_stride = subsampling_conv_stride
+        self.subsampling_conv_channels = subsampling_conv_channels
+        self.num_mel_bins = num_mel_bins
+
+        self.dropout = dropout
+        self.dropout_positions = dropout_positions
+        self.layerdrop = layerdrop
+        self.activation_dropout = activation_dropout
+        self.attention_dropout = attention_dropout
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+
+        super().__init__(**kwargs)
+
+
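A minimal usage sketch for the encoder config, assuming `rope_parameters` follows the usual transformers convention of a dict carrying `rope_theta`; the hunk only stores the value as-is, so the exact expected shape is an assumption:

```python
from transformers import LasrEncoderConfig

# Hypothetical overrides of the defaults shown in __init__ above.
config = LasrEncoderConfig(
    hidden_size=256,
    num_hidden_layers=8,
    rope_parameters={"rope_theta": 10000.0},  # assumed dict shape
)

# num_key_value_heads mirrors num_attention_heads (LlamaAttention compatibility).
assert config.num_key_value_heads == config.num_attention_heads
```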
+class LasrCTCConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`LasrForCTC`]. It is used to instantiate a
+    Lasr CTC model according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 512):
+            Vocabulary size of the model.
+        ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`LasrForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `True`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
+            occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
+            of [`LasrForCTC`].
+        encoder_config (`Union[dict, LasrEncoderConfig]`, *optional*):
+            The config object or dictionary of the encoder.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id. Also used as the blank token id.
+
+    Example:
+    ```python
+    >>> from transformers import LasrForCTC, LasrCTCConfig
+
+    >>> # Initializing a Lasr configuration
+    >>> configuration = LasrCTCConfig()
+
+    >>> # Initializing a model from the configuration
+    >>> model = LasrForCTC(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+
+    This configuration class is based on the Lasr CTC architecture from Google Health AI. You can find more details
+    and pre-trained models at [TODO/TODO](https://huggingface.co/TODO/TODO).
+    """
+
+    model_type = "lasr_ctc"
+    sub_configs = {"encoder_config": LasrEncoderConfig}
+
+    def __init__(
+        self,
+        vocab_size=512,
+        ctc_loss_reduction="mean",
+        ctc_zero_infinity=True,
+        encoder_config: Union[dict, LasrEncoderConfig] = None,
+        pad_token_id=0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.ctc_loss_reduction = ctc_loss_reduction
+        self.ctc_zero_infinity = ctc_zero_infinity
+
+        if isinstance(encoder_config, dict):
+            encoder_config = LasrEncoderConfig(**encoder_config)
+        elif encoder_config is None:
+            encoder_config = LasrEncoderConfig()
+
+        # A ready-made LasrEncoderConfig instance is accepted as-is.
+        self.encoder_config = encoder_config
+        self.initializer_range = self.encoder_config.initializer_range
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            **kwargs,
+        )
+
+    @classmethod
+    def from_encoder_config(cls, encoder_config: LasrEncoderConfig, **kwargs):
+        r"""
+        Instantiate a [`LasrCTCConfig`] (or a derived class) from a lasr encoder model configuration.
+
+        Returns:
+            [`LasrCTCConfig`]: An instance of a configuration object
+        """
+        return cls(encoder_config=encoder_config.to_dict(), **kwargs)
+
+    @property
+    def inputs_to_logits_ratio(self):
+        return self.encoder_config.subsampling_conv_stride**2
+
+
+__all__ = ["LasrEncoderConfig", "LasrCTCConfig"]
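A short sketch of how the two configs compose, assuming only the classes defined in this hunk. That `inputs_to_logits_ratio` squares the stride suggests a two-stage subsampling stack, but that reading is an inference from the property, not something the hunk states:

```python
from transformers import LasrCTCConfig, LasrEncoderConfig

encoder_config = LasrEncoderConfig(subsampling_conv_stride=2)
ctc_config = LasrCTCConfig.from_encoder_config(encoder_config, vocab_size=1024)

# stride ** 2 = 4 input feature frames per output logit
assert ctc_config.inputs_to_logits_ratio == 4
# initializer_range is copied over from the encoder sub-config
assert ctc_config.initializer_range == encoder_config.initializer_range
```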
@@ -0,0 +1,277 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team and Google LLC. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+
+import numpy as np
+import torch
+
+from ...audio_utils import hertz_to_mel
+from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ...feature_extraction_utils import BatchFeature
+from ...utils import TensorType, logging
+from ...utils.import_utils import requires
+
+
+logger = logging.get_logger(__name__)
+
+
+# TODO: @eustlb, we should be able to remove this and use mel_filter_bank from audio_utils
+def linear_to_mel_weight_matrix(
+    num_mel_bins: int,
+    num_spectrogram_bins: int,
+    sample_rate: float,
+    lower_edge_hertz: float,
+    upper_edge_hertz: float,
+    dtype,
+) -> np.ndarray:
+    """NumPy port of the JAX mel weight matrix logic."""
+    # We use float64 for precision, matching the JAX implementation.
+    internal_dtype = np.float64
+
+    # HTK excludes the spectrogram DC bin.
+    bands_to_zero = 1
+    nyquist_hertz = sample_rate / 2.0
+    linear_frequencies = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins, dtype=internal_dtype)[bands_to_zero:]
+    spectrogram_bins_mel = hertz_to_mel(linear_frequencies, mel_scale="kaldi")[:, np.newaxis]
+
+    edges = np.linspace(
+        hertz_to_mel(lower_edge_hertz, mel_scale="kaldi"),
+        hertz_to_mel(upper_edge_hertz, mel_scale="kaldi"),
+        num_mel_bins + 2,
+        dtype=internal_dtype,
+    )
+
+    lower_edge_mel, center_mel, upper_edge_mel = (
+        edges[:-2][np.newaxis, :],
+        edges[1:-1][np.newaxis, :],
+        edges[2:][np.newaxis, :],
+    )
+
+    # Triangular filters: rising slope up to the band center, falling slope after it.
+    lower_slopes = (spectrogram_bins_mel - lower_edge_mel) / (center_mel - lower_edge_mel)
+    upper_slopes = (upper_edge_mel - spectrogram_bins_mel) / (upper_edge_mel - center_mel)
+    mel_weights_matrix = np.maximum(0.0, np.minimum(lower_slopes, upper_slopes))
+    # Re-insert the zeroed DC bin so the matrix is (num_spectrogram_bins, num_mel_bins).
+    return np.pad(mel_weights_matrix, [[bands_to_zero, 0], [0, 0]]).astype(dtype)
+
+
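Since the filter-bank shapes are easy to get wrong, a small sanity-check sketch, assuming `linear_to_mel_weight_matrix` from this new module is in scope (its module path is not shown in this hunk):

```python
import numpy as np

# With n_fft = 512 there are 512 // 2 + 1 = 257 spectrogram bins,
# mirroring the defaults LasrFeatureExtractor passes below.
mel_filters = linear_to_mel_weight_matrix(
    num_mel_bins=128,
    num_spectrogram_bins=257,
    sample_rate=16000,
    lower_edge_hertz=125.0,
    upper_edge_hertz=7500.0,
    dtype=np.float64,
)
assert mel_filters.shape == (257, 128)
# Row 0 is the re-inserted DC bin, so `power_spec @ mel_filters` ignores DC.
assert np.all(mel_filters[0] == 0.0)
```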
+@requires(backends=("torch",))
+class LasrFeatureExtractor(SequenceFeatureExtractor):
+    r"""
+    Constructs a LASR feature extractor.
+
+    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
+    most of the main methods. Users should refer to this superclass for more information regarding those methods.
+
+    This class extracts mel-filter bank features from raw speech using a custom PyTorch implementation of the Short
+    Time Fourier Transform. Note that, unlike `torch.stft`, each frame is unfolded at `win_length` samples and then
+    zero-padded to `n_fft` points by the rfft.
+
+    Args:
+        feature_size (`int`, *optional*, defaults to 128):
+            The feature dimension of the extracted features.
+        sampling_rate (`int`, *optional*, defaults to 16000):
+            The sampling rate at which the audio files should be digitalized, expressed in hertz (Hz).
+        hop_length (`int`, *optional*, defaults to 160):
+            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
+        n_fft (`int`, *optional*, defaults to 512):
+            Size of the Fourier transform.
+        win_length (`int`, *optional*, defaults to 400):
+            The window length for the STFT computation.
+        padding_value (`float`, *optional*, defaults to 0.0):
+            Padding value used to pad the audio. Should correspond to silences.
+    """
+
+    model_input_names = ["input_features", "attention_mask"]
+
+    def __init__(
+        self,
+        feature_size=128,
+        sampling_rate=16000,
+        hop_length=160,
+        n_fft=512,
+        win_length=400,
+        padding_value=0.0,
+        **kwargs,
+    ):
+        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
+
+        self.hop_length = hop_length
+        self.n_fft = n_fft
+        self.win_length = win_length
+        self.mel_filters = torch.from_numpy(
+            linear_to_mel_weight_matrix(
+                num_mel_bins=feature_size,
+                num_spectrogram_bins=n_fft // 2 + 1,
+                sample_rate=sampling_rate,
+                lower_edge_hertz=125.0,
+                upper_edge_hertz=7500.0,
+                dtype=np.float64,
+            )
+        )
+
+    def _torch_extract_fbank_features(self, waveform, device="cpu"):
+        # spectrogram
+        window = torch.hann_window(self.win_length, periodic=False, device=device, dtype=torch.float64)
+        waveform = waveform.to(torch.float64)
+
+        # TODO: @eustlb, to be standardized
+        # here we cannot use torch.stft directly because every fft frame is padded with zeros
+        # due to unfold then rfft, while torch.stft unfolds with the number of fft points
+        frames = waveform.unfold(-1, self.win_length, self.hop_length)
+        stft = torch.fft.rfft(window * frames, n=self.n_fft)
+        power_spec = torch.abs(stft) ** 2
+
+        # log mel spectrogram
+        mel_filters = self.mel_filters.to(device)
+        mel_spec = torch.clamp(power_spec @ mel_filters, min=1e-5)
+        mel_spec = torch.log(mel_spec)
+
+        return mel_spec
+
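The unfold-based framing above fixes the number of output frames, and `__call__` below subsamples the attention mask with the same arithmetic. A self-contained sketch of that correspondence, using the extractor's default sizes (the one-second input length is arbitrary):

```python
import torch

win_length, hop_length = 400, 160
num_samples = 16000  # 1 s at 16 kHz

waveform = torch.randn(1, num_samples)
frames = waveform.unfold(-1, win_length, hop_length)
# unfold yields 1 + (num_samples - win_length) // hop_length frames
assert frames.shape == (1, 1 + (num_samples - win_length) // hop_length, win_length)

# Slicing a sample-level mask the way __call__ does gives one entry per frame:
attention_mask = torch.ones(1, num_samples, dtype=torch.long)
frame_mask = attention_mask[:, win_length - 1 :: hop_length]
assert frame_mask.shape[1] == frames.shape[1]
```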
+    def __call__(
+        self,
+        raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
+        truncation: bool = False,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_attention_mask: Optional[bool] = None,
+        padding: Optional[str] = "longest",
+        max_length: Optional[int] = None,
+        sampling_rate: Optional[int] = None,
+        do_normalize: Optional[bool] = None,
+        device: Optional[str] = "cpu",
+        return_token_timestamps: Optional[bool] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Main method to featurize and prepare one or several sequence(s) for the model. The STFT and log-mel
+        computation are implemented in PyTorch and can be run on an accelerator via the `device` argument.
+
+        Args:
+            raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
+                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
+                values, a list of numpy arrays or a list of lists of float values. Must be mono channel audio, not
+                stereo, i.e. a single float per timestep.
+            truncation (`bool`, *optional*, defaults to `False`):
+                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
+            pad_to_multiple_of (`int`, *optional*):
+                If set, will pad the sequence to a multiple of the provided value.
+
+                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
+            return_attention_mask (`bool`, *optional*):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific feature_extractor's default.
+
+                [What are attention masks?](../glossary#attention-mask)
+
+                <Tip>
+
+                For Lasr models, `attention_mask` should always be passed for batched inference, to avoid subtle
+                bugs.
+
+                </Tip>
+
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of lists of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            sampling_rate (`int`, *optional*):
+                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
+                `sampling_rate` at the forward call to prevent silent errors and to allow automatic speech
+                recognition pipelines to work correctly.
+            padding_value (`float`, *optional*, defaults to 0.0):
+                The value that is used to fill the padding values / vectors.
+            do_normalize (`bool`, *optional*, defaults to `False`):
+                Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
+                improve the performance of the model.
+            device (`str`, *optional*, defaults to `'cpu'`):
+                Specifies the device for the computation of the log-mel spectrogram of audio signals in the
+                `_torch_extract_fbank_features` method (e.g., "cpu", "cuda").
+            return_token_timestamps (`bool`, *optional*):
+                Deprecated. Use `return_attention_mask` instead, from which the number of frames can be inferred.
+
+                Whether or not to return the number of frames of the input raw_speech.
+                These num_frames can be used by the model to compute word level timestamps.
+        """
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                raise ValueError(
+                    f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
+                    f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
+                    f" was sampled with {self.sampling_rate} and not {sampling_rate}."
+                )
+        else:
+            logger.warning(
+                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        # Convert to torch tensors
+        if isinstance(raw_speech, np.ndarray):
+            raw_speech = torch.tensor(raw_speech)
+        elif isinstance(raw_speech, (list, tuple)):
+            if isinstance(raw_speech[0], (list, np.ndarray)):
+                raw_speech = [torch.tensor(speech) for speech in raw_speech]
+            else:  # list[float]
+                raw_speech = torch.tensor(raw_speech)
+
+        is_batched_torch = isinstance(raw_speech, torch.Tensor) and len(raw_speech.shape) > 1
+        if is_batched_torch and len(raw_speech.shape) > 2:
+            logger.warning(
+                f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
+                "We will take the mean of the channels to convert to mono."
+            )
+            raw_speech = raw_speech.mean(-1)
+
+        is_batched_sequence = isinstance(raw_speech, (list, tuple))
+        if is_batched_sequence:
+            raw_speech = list(raw_speech)
+            for i, speech in enumerate(raw_speech):
+                if len(speech.shape) > 1:
+                    logger.warning(
+                        f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
+                        "We will take the mean of the channels to convert to mono."
+                    )
+                    # Write back into the list: mutating the loop variable alone would be a no-op.
+                    raw_speech[i] = speech.mean(-1)
+
+        if is_batched_torch or is_batched_sequence:
+            raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]
+        else:
+            raw_speech = [raw_speech[:, None].to(torch.float32)]
+
+        batched_speech = BatchFeature({"input_features": raw_speech})
+        padded_inputs = self.pad(
+            batched_speech,
+            padding=padding,
+            max_length=max_length,
+            truncation=truncation,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=return_attention_mask,
+            return_tensors="pt",
+        )
+        input_features = padded_inputs.input_features.squeeze(-1)
+        input_features = self._torch_extract_fbank_features(input_features, device)
+        data = {"input_features": input_features.to(torch.float32)}
+
+        if return_attention_mask:
+            # One mask entry per STFT frame: the first frame completes at sample index win_length - 1,
+            # and subsequent frames start every hop_length samples.
+            attention_mask = padded_inputs.attention_mask[:, self.win_length - 1 :: self.hop_length]
+            data["attention_mask"] = attention_mask.to(torch.bool)
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["LasrFeatureExtractor"]
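Finally, an end-to-end usage sketch for the extractor, assuming the top-level `LasrFeatureExtractor` export matches the `__all__` above; the inputs are random placeholders:

```python
import numpy as np
from transformers import LasrFeatureExtractor

feature_extractor = LasrFeatureExtractor()  # 128 mel bins, 16 kHz, hop 160, win 400

speech = [np.random.randn(16000).astype(np.float32), np.random.randn(8000).astype(np.float32)]
inputs = feature_extractor(
    speech,
    sampling_rate=16000,
    return_attention_mask=True,  # recommended for batched inference
    return_tensors="pt",
)
print(inputs.input_features.shape)  # (2, num_frames, 128)
print(inputs.attention_mask.shape)  # (2, num_frames), bool, False on padding
```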