transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff covers the changes between two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -25,6 +25,7 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 
+from ... import initialization as init
 from ...activations import GELUActivation
 from ...cache_utils import Cache, DynamicCache
 from ...image_processing_utils import BatchFeature
@@ -776,6 +777,14 @@ class PaddleOCRVLPreTrainedModel(PreTrainedModel):
         "attentions": PaddleOCRAttention,
     }
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, PaddleOCRVisionEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+        elif isinstance(module, PaddleOCRVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 class PaddleOCRTextModel(PaddleOCRVLPreTrainedModel, Ernie4_5Model):
     def __init__(self, config: PaddleOCRTextConfig):
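The new `_init_weights` override re-creates the vision embeddings' `position_ids` buffer and the rotary embedding's `inv_freq` buffer during weight initialization; the same inverse-frequency pattern appears again in the Parakeet hunks further down. A minimal sketch of those two expressions, using assumed toy values for `dim`, `theta`, and the sequence length (the hunk reads them from the module):

```python
import torch

# Assumed toy values; the hunk above takes these from module.dim and module.theta.
dim, theta = 8, 10000.0

# One inverse frequency per rotary channel pair: 1.0, 0.1, 0.01, 0.001 for dim=8.
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))

# The position_ids buffer is just 0..n-1 with a leading batch axis, shape (1, n).
position_ids = torch.arange(16).expand((1, -1))
```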
@@ -977,18 +986,17 @@ class PaddleOCRVisionEncoder(VideoLlama3VisionEncoder):
         attention_mask: Optional[torch.Tensor] = None,
         image_grid_thw: Optional[list[Union[tuple[int, int, int], list[tuple[int, int, int]]]]] = None,
     ) -> BaseModelOutput:
-        """
-
-
-
-
-
-
-
-            attention_mask
-
-
-                The temporal, height and width of feature shape of each image in LLM.
+        r"""
+        inputs_embeds (`torch.FloatTensor` of shape `(sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
+        cu_seqlens (`torch.Tensor` of shape `(num_images + 1,)`):
+            The cumulative sequence lengths of each image or video feature.
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            The attention_mask used in forward function shape [batch_size X sequence_length] if not None.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
         """
         device = inputs_embeds.device
         hidden_states = inputs_embeds
@@ -1037,6 +1045,8 @@ class PaddleOCRVisionTransformer(PaddleOCRVLPreTrainedModel):
         self.encoder = PaddleOCRVisionEncoder(config)
         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
 
+        self.post_init()
+
     def forward(
         self,
         pixel_values: torch.FloatTensor,
@@ -149,7 +149,8 @@ def create_causal_mask_mapping(
     position_ids: Optional[torch.Tensor],
     token_type_ids: Optional[torch.Tensor] = None,
     pixel_values: Optional[torch.FloatTensor] = None,
-    is_training: bool = False,
+    is_training: Optional[bool] = False,
+    is_first_iteration: Optional[bool] = None,
     **kwargs,
 ) -> dict:
     """
@@ -169,31 +170,33 @@ def create_causal_mask_mapping(
         "past_key_values": past_key_values,
         "position_ids": position_ids,
     }
-    #
-    #
-    #
-
-
-
+    # Infer if prefill or decoding stage, if the flag isn't passed. This happens only when the mask is constructed
+    # from `forward` call. If users run a `forward` call, we have no option to infer `is_first_iteration` because users may be
+    # running generation with custom loop. Thus we need to infer it in a `non-perfect` way
+    # NOTE: Determining prefill in that case requires checking data values, which is not compile-compatible.
+    is_first_iteration = (
+        is_first_iteration
+        if is_first_iteration
+        else (past_key_values is None or not past_key_values.is_initialized or pixel_values is not None)
+    )
+
+    if is_first_iteration or not kwargs.get("use_cache", True):
         if token_type_ids is not None:
             # The logic bellow was originally written for Gemma3, where `token_type_ids` is reversed. Let's reverse
             # it to then use exactly the same logic.
             token_type_ids = 1 - token_type_ids
         else:
             logger.warning_once(
-                "
+                "It is a prefill stage but The `token_type_ids` is not provided. We recommend "
                 "passing `token_type_ids` to the model to prevent bad attention masking."
             )
-            # BC: when NOT training, use bidirectional mask if sequence length > 1. Otherwise, use the default causal
-            # mask. This is incorrect in some advanced use cases, hence the warning above.
             # NOTE: this branch can't be reached when training because `token_type_ids` is required as a model input.
-
-            token_type_ids = torch.ones_like(input_embeds)[:, :, 0]
+            token_type_ids = torch.ones_like(input_embeds)[:, :, 0]
 
     # Logic originally copied from Gemma3. It holds up for Paligemma as well because Paligemma assumes up to one image
     # per prompt AND we reverse `token_type_ids` above. Gemma3 uses a bidirectional mask for images, tagged through
     # `token_type_ids` 1s.
-    if token_type_ids is not None and
+    if token_type_ids is not None and is_first_iteration:
        # We need to pass an additional mask function to account for token type ids, and it needs to be an `or` (to
        # undo the causal masking)
 
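The added block infers the prefill/decode stage when the caller does not pass `is_first_iteration`: any call without an initialized KV cache, or any call carrying `pixel_values`, is treated as a first iteration. A standalone sketch of that predicate (the stub below only mimics the `is_initialized` attribute of the real `Cache` object):

```python
# Standalone sketch of the fallback predicate added above; FakeCache stands in for
# the transformers Cache instance, exposing just the attribute the predicate reads.
class FakeCache:
    def __init__(self, is_initialized):
        self.is_initialized = is_initialized


def infer_first_iteration(is_first_iteration, past_key_values, pixel_values):
    return (
        is_first_iteration
        if is_first_iteration
        else (past_key_values is None or not past_key_values.is_initialized or pixel_values is not None)
    )


print(infer_first_iteration(None, None, None))                 # True: no cache yet -> prefill
print(infer_first_iteration(None, FakeCache(True), None))      # False: cache filled -> decode step
print(infer_first_iteration(None, FakeCache(True), "pixels"))  # True: a new image arrives with a filled cache
print(infer_first_iteration(True, FakeCache(True), None))      # True: an explicit flag always wins
```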
@@ -550,6 +553,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi
         use_cache=True,
         logits_to_keep=None,
         labels=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- custom `position_ids` and `pixel_values` handling
@@ -563,6 +567,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi
             use_cache=use_cache,
             logits_to_keep=logits_to_keep,
             token_type_ids=token_type_ids,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
@@ -570,9 +575,11 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi
         if model_inputs.get("position_ids") is not None:
             model_inputs["position_ids"] += 1
 
-        #
-        #
-
+        # Pixel values are used only in the first iteration if available
+        # In subsquent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache). NOTE: use_cache=False needs pixel_values always
+        if is_first_iteration or not use_cache:
             model_inputs["pixel_values"] = pixel_values
 
         return model_inputs
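As the comments above state, pixel values only need to reach the model on the first iteration; after that the image features are already merged into the KV cache. A hedged sketch of a manual greedy loop consistent with that behaviour (`model`, `input_ids`, and `pixel_values` are placeholders, not taken from the diff; `generate()` handles this internally):

```python
import torch


def greedy_decode(model, input_ids, pixel_values, max_new_tokens=16):
    # Hand-rolled loop for illustration only, under the assumption of a standard
    # causal-LM-with-vision forward signature.
    past_key_values = None
    generated = input_ids
    next_ids = input_ids
    for step in range(max_new_tokens):
        outputs = model(
            input_ids=next_ids,
            # Images are fed on the first step only; later steps rely on the cache.
            pixel_values=pixel_values if step == 0 else None,
            past_key_values=past_key_values,
            use_cache=True,
        )
        past_key_values = outputs.past_key_values
        next_ids = outputs.logits[:, -1:].argmax(dim=-1)
        generated = torch.cat([generated, next_ids], dim=-1)
    return generated
```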
@@ -586,6 +593,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi
         past_key_values: Optional[Cache],
         position_ids: Optional[torch.Tensor],
         token_type_ids: Optional[torch.Tensor] = None,
+        is_first_iteration: Optional[bool] = False,
         **kwargs,
     ) -> dict:
         # Uses the overwritten `create_masks_for_generate` with `token_type_ids` masking
@@ -597,7 +605,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi
             past_key_values,
             position_ids,
             token_type_ids,
-
+            is_first_iteration=is_first_iteration,
             **{k: v for k, v in kwargs.items() if k != "pixel_values"},
         )
 
@@ -510,6 +510,11 @@ class ParakeetPreTrainedModel(PreTrainedModel):
             # Initialize positional bias parameters
             init.normal_(module.bias_u, mean=0.0, std=std)
             init.normal_(module.bias_v, mean=0.0, std=std)
+        elif isinstance(module, ParakeetEncoderRelPositionalEncoding):
+            inv_freq = 1.0 / (
+                10000.0 ** (torch.arange(0, self.config.hidden_size, 2, dtype=torch.int64) / self.config.hidden_size)
+            )
+            init.copy_(module.inv_freq, inv_freq)
 
     def _get_subsampling_output_length(self, input_lengths: torch.Tensor):
         encoder_config = self.config.encoder_config if isinstance(self.config, ParakeetCTCConfig) else self.config
@@ -346,6 +346,11 @@ class ParakeetPreTrainedModel(PreTrainedModel):
             # Initialize positional bias parameters
             init.normal_(module.bias_u, mean=0.0, std=std)
             init.normal_(module.bias_v, mean=0.0, std=std)
+        elif isinstance(module, ParakeetEncoderRelPositionalEncoding):
+            inv_freq = 1.0 / (
+                10000.0 ** (torch.arange(0, self.config.hidden_size, 2, dtype=torch.int64) / self.config.hidden_size)
+            )
+            init.copy_(module.inv_freq, inv_freq)
 
     def _get_subsampling_output_length(self, input_lengths: torch.Tensor):
         encoder_config = self.config.encoder_config if isinstance(self.config, ParakeetCTCConfig) else self.config
@@ -16,10 +16,10 @@
 import itertools
 from typing import Optional, Union
 
-from ...tokenization_utils_tokenizers import
+from ...tokenization_utils_tokenizers import TokenizersBackend
 
 
-class
+class ParakeetTokenizer(TokenizersBackend):
     """
     Inherits all methods from [`PreTrainedTokenizerFast`]. Users should refer to this superclass for more information regarding those methods,
     except for `_decode` which is overridden to adapt it to CTC decoding:
@@ -51,4 +51,4 @@ class ParakeetTokenizerFast(PreTrainedTokenizerFast):
         )
 
 
-__all__ = ["
+__all__ = ["ParakeetTokenizer"]
@@ -696,6 +696,10 @@ class PatchTSMixerPreTrainedModel(PreTrainedModel):
         elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, PatchTSMixerBatchNorm):
             init.zeros_(module.batchnorm.bias)
             init.ones_(module.batchnorm.weight)
@@ -584,12 +584,13 @@ class PatchTSTPreTrainedModel(PreTrainedModel):
                 init.copy_(module.position_enc, position_enc)
             else:
                 init.copy_(module.position_enc, position_enc)
-        elif isinstance(module, nn.LayerNorm):
+        elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
-
-
-
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, nn.Linear):
             init.normal_(module.weight, mean=0.0, std=self.config.init_std)
             if module.bias is not None:
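Both BatchNorm hunks above extend the LayerNorm branch to `nn.BatchNorm1d` and reset its running statistics, which are buffers rather than parameters and therefore untouched by the weight-only initializers. A quick check of the buffer names this relies on:

```python
import torch.nn as nn

bn = nn.BatchNorm1d(8)
# BatchNorm keeps its running statistics as buffers, not parameters:
print(sorted(name for name, _ in bn.named_buffers()))
# ['num_batches_tracked', 'running_mean', 'running_var']
print([name for name, _ in bn.named_parameters()])
# ['weight', 'bias']
```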
@@ -0,0 +1,30 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_pe_audio import *
+    from .feature_extraction_pe_audio import *
+    from .modeling_pe_audio import *
+    from .processing_pe_audio import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
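The new `__init__.py` swaps the package's entry in `sys.modules` for a `_LazyModule`, so the pe_audio submodules are only imported when something from them is first accessed. A minimal stand-in (not the transformers helper) showing the idea:

```python
import importlib
import os
import types


class LazyPackage(types.ModuleType):
    """Import submodules on first attribute access instead of at package import time."""

    def __init__(self, name, file, submodules):
        super().__init__(name)
        self.__file__ = file
        self.__path__ = [os.path.dirname(file)]  # keep the package importable
        self._submodules = set(submodules)

    def __getattr__(self, item):
        if item in self._submodules:
            module = importlib.import_module(f"{self.__name__}.{item}")
            setattr(self, item, module)  # cache so later lookups skip __getattr__
            return module
        raise AttributeError(f"module {self.__name__!r} has no attribute {item!r}")

# A package __init__.py would then replace itself (sketch):
# sys.modules[__name__] = LazyPackage(__name__, __file__, {"configuration_pe_audio", "modeling_pe_audio"})
```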
@@ -0,0 +1,206 @@
|
|
|
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+
+from ...configuration_utils import PreTrainedConfig, PretrainedConfig
+from ...modeling_rope_utils import RopeParameters
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class PeAudioEncoderConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PeAudioEncoder`]. It is used to instantiate a
+    PeAudioEncoder model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of pe-av-large.
+    e.g. [facebook/pe-av-large](https://huggingface.co/facebook/pe-av-large)
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+
+    Args:
+        dac_config (`Union[PreTrainedConfig, dict]`, *optional*):
+            Configuration for the DAC audio encoder used to tokenize the raw audio inputs. If a dictionary is passed, it
+            will be used to instantiate a [`~transformers.DacConfig`] with default DAC hyperparameters.
+        hidden_size (`int`, *optional*, defaults to 1792):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 4800):
+            Dimension of the feedforward layers in the Transformer blocks.
+        num_hidden_layers (`int`, *optional*, defaults to 6):
+            Number of Transformer encoder blocks.
+        num_attention_heads (`int`, *optional*, defaults to 14):
+            Number of attention heads used in each attention layer.
+        num_key_value_heads (`int`, *optional*):
+            Number of key and value heads for grouped-query attention. If unset, this defaults to `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each attention head for query, key, and value projections.
+        hidden_act (`str`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the Transformer blocks.
+        max_position_embeddings (`int`, *optional*, defaults to 10000):
+            Maximum sequence length supported by the rotary position embeddings.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            Standard deviation of the truncated normal initializer for weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            Epsilon used by the RMS normalization layers.
+        rope_parameters (`Union[RopeParameters, dict]`, *optional*, defaults to `{'rope_theta': 20000}`):
+            Parameters for the rotary position embeddings, such as the base `rope_theta`.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias terms in the query, key, value, and output projections.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout ratio applied to attention probabilities.
+
+    ```python
+    >>> from transformers import PeAudioEncoder, PeAudioEncoderConfig
+
+    >>> # Initializing a PeAudioEncoder style configuration
+    >>> configuration = PeAudioEncoderConfig()
+
+    >>> # Initializing a model from the pe-av-large style configuration
+    >>> model = PeAudioEncoder(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "pe_audio_encoder"
+    sub_configs = {"dac_config": AutoConfig}
+    base_config_key = "audio_video_config"
+
+    _default_dac_config_kwargs = {
+        "downsampling_ratios": [2, 8, 10, 12],
+        "encoder_hidden_size": 64,
+        "codebook_dim": 128,
+    }
+
+    def __init__(
+        self,
+        dac_config: Optional[Union[dict, PreTrainedConfig]] = None,
+        hidden_size: Optional[int] = 1792,
+        intermediate_size: Optional[int] = 4800,
+        num_hidden_layers: Optional[int] = 6,
+        num_attention_heads: Optional[int] = 14,
+        num_key_value_heads: Optional[int] = None,
+        head_dim: Optional[int] = 128,
+        hidden_act: Optional[str] = "silu",
+        max_position_embeddings: Optional[int] = 10000,
+        initializer_range: Optional[float] = 0.02,
+        rms_norm_eps: Optional[float] = 1e-5,
+        rope_parameters: Optional[Union[RopeParameters, dict]] = {"rope_theta": 20000},
+        attention_bias: Optional[bool] = False,
+        attention_dropout: Optional[float] = 0.0,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.rope_parameters = rope_parameters
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        if isinstance(dac_config, dict):
+            dac_config["model_type"] = dac_config.get("model_type", "dac")
+            dac_config = CONFIG_MAPPING[dac_config["model_type"]](**{**self._default_dac_config_kwargs, **dac_config})
+        elif dac_config is None:
+            dac_config = CONFIG_MAPPING["dac"](**self._default_dac_config_kwargs)
+
+        self.dac_config = dac_config
+
+        super().__init__(**kwargs)
+
+
+class PeAudioConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PeAudioModel`]. It is used to instantiate a
+    PeAudioModel model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of pe-av-large.
+    e.g. [facebook/pe-av-large](https://huggingface.co/facebook/pe-av-large)
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+
+    Args:
+        text_config (`dict` or `PreTrainedConfig`, *optional*):
+            Configuration for the text model component.
+        audio_config (`dict` or `PreTrainedConfig`, *optional*):
+            Configuration for the audio encoder component.
+
+    ```python
+    >>> from transformers import PeAudioModel, PeAudioConfig
+
+    >>> # Initializing a PeAudioModel style configuration
+    >>> configuration = PeAudioConfig()
+
+    >>> # Initializing a model from the pe-av-large style configuration
+    >>> model = PeAudioModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "pe_audio"
+    sub_configs = {"text_config": AutoConfig, "audio_config": PeAudioEncoderConfig}
+    base_config_key = "audio_video_config"
+
+    _default_text_config_kwargs = {
+        "model_type": "modernbert",
+        "hidden_size": 1024,
+        "intermediate_size": 2624,
+        "num_hidden_layers": 22,
+        "num_attention_heads": 16,
+    }
+
+    def __init__(
+        self,
+        text_config=None,
+        audio_config=None,
+        **kwargs,
+    ):
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "modernbert")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](
+                **{**self._default_text_config_kwargs, **text_config}
+            )
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["modernbert"](**self._default_text_config_kwargs)
+
+        if isinstance(audio_config, dict):
+            audio_config = PeAudioEncoderConfig(**audio_config)
+        elif audio_config is None:
+            audio_config = PeAudioEncoderConfig()
+
+        self.text_config = text_config
+        self.audio_config = audio_config
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["PeAudioEncoderConfig", "PeAudioConfig"]
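Both classes build their sub-configurations the same way: a dict argument is merged over the class-level default kwargs before the registered config class is instantiated, and `None` falls back to those defaults entirely. A minimal sketch of that merging behaviour, assuming the classes are importable from the top-level `transformers` namespace once the auto mappings are registered:

```python
from transformers import PeAudioConfig, PeAudioEncoderConfig

# A dict dac_config is merged over _default_dac_config_kwargs, so only the
# overridden keys need to be provided.
audio_config = PeAudioEncoderConfig(dac_config={"codebook_dim": 256})
print(audio_config.dac_config.codebook_dim)         # 256 (override)
print(audio_config.dac_config.encoder_hidden_size)  # 64 (class default kept)

# PeAudioConfig applies the same pattern to its text and audio sub-configs:
# with no arguments it builds a default ModernBERT text config and a default
# PeAudioEncoderConfig.
config = PeAudioConfig()
print(config.text_config.model_type)   # "modernbert"
print(config.audio_config.model_type)  # "pe_audio_encoder"
```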
transformers/models/pe_audio/feature_extraction_pe_audio.py (new file)
@@ -0,0 +1,162 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ...feature_extraction_utils import BatchFeature
+from ...processing_utils import load_audio
+from ...utils import PaddingStrategy, TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class PeAudioFeatureExtractor(SequenceFeatureExtractor):
+    r"""
+    Constructs a PeAudioFeatureExtractor feature extractor.
+
+    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
+    most of the main methods. Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        feature_size (`int`, *optional*, defaults to 1):
+            The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
+        sampling_rate (`int`, *optional*, defaults to 48000):
+            The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz).
+        padding_value (`float`, *optional*, defaults to 0.0):
+            The value that is used for padding.
+        hop_length (`int`, *optional*, defaults to 1920):
+            Overlap length between successive windows.
+    """
+
+    model_input_names = ["input_values"]
+
+    def __init__(
+        self,
+        feature_size: int = 1,
+        sampling_rate: int = 48_000,
+        padding_value: float = 0.0,
+        hop_length: int = 1920,
+        **kwargs,
+    ):
+        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
+        self.hop_length = hop_length
+
+    def _reflect_pad(self, wav):
+        if len(wav) % self.hop_length == 0:
+            return wav
+        p1d = (0, self.hop_length - (len(wav) % self.hop_length))
+        return np.pad(wav, p1d, "reflect")
+
+    def __call__(
+        self,
+        raw_audio: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]], str, list[str]],
+        padding: Optional[Union[bool, str, PaddingStrategy]] = None,
+        truncation: Optional[bool] = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        sampling_rate: Optional[int] = None,
+    ) -> BatchFeature:
+        from_file = False
+        if isinstance(raw_audio, str):
+            raw_audio = [raw_audio]
+
+        if isinstance(raw_audio, (list, tuple)) and isinstance(raw_audio[0], str):
+            loaded = []
+            for audio_file in raw_audio:
+                loaded.append(load_audio(audio_file, self.sampling_rate))
+            raw_audio = loaded
+            from_file = True
+
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                raise ValueError(
+                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
+                    f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
+                    f" {self.sampling_rate} and not {sampling_rate}."
+                )
+        elif not from_file:
+            logger.warning(
+                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        if padding and truncation:
+            raise ValueError("Both padding and truncation were set. Make sure you only set one.")
+        elif padding is None:
+            # by default let's pad the inputs
+            padding = True
+
+        is_batched = bool(
+            isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
+        )
+
+        if is_batched:
+            raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
+        elif not is_batched and not isinstance(raw_audio, np.ndarray):
+            raw_audio = np.asarray(raw_audio, dtype=np.float32)
+        elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
+            raw_audio = raw_audio.astype(np.float32)
+
+        # always return batch
+        if not is_batched:
+            raw_audio = [np.asarray(raw_audio).T]
+
+        if isinstance(raw_audio, list):
+            raw_audio = [self._reflect_pad(x) for x in raw_audio]
+        else:
+            raw_audio = self._reflect_pad(raw_audio)
+
+        # verify inputs are valid
+        for example in raw_audio:
+            if example.ndim > 2:
+                raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
+            if self.feature_size == 1 and example.ndim != 1:
+                raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
+            if self.feature_size == 2:
+                raise ValueError("Stereo audio isn't supported for now")
+
+        input_values = BatchFeature({"input_values": raw_audio})
+
+        # normal padding on batch
+        padded_inputs = self.pad(
+            input_values,
+            max_length=max_length,
+            truncation=truncation,
+            padding=padding,
+            return_attention_mask=padding,
+            pad_to_multiple_of=self.hop_length,
+        )
+        if padding:
+            padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")
+        if padding:
+            padded_inputs.input_values = padded_inputs.input_values[:, np.newaxis, :]
+
+        input_values = []
+        for example in padded_inputs.pop("input_values"):
+            if self.feature_size == 1:
+                example = example[..., None]
+            input_values.append(example.T)
+
+        padded_inputs["input_values"] = input_values
+        if return_tensors is not None:
+            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
+
+        return padded_inputs
+
+
+__all__ = ["PeAudioFeatureExtractor"]
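The `__call__` above accepts raw waveforms (or file paths, loaded via `load_audio`), reflect-pads each example to a multiple of `hop_length`, then batches and pads. A minimal usage sketch, assuming the class is exported at the top level; exact output shapes depend on the padding path, so only the general layout is described in the comments:

```python
import numpy as np

from transformers import PeAudioFeatureExtractor

feature_extractor = PeAudioFeatureExtractor()  # 48 kHz mono, hop_length=1920

# One second of mono audio at the expected sampling rate.
waveform = np.random.randn(48_000).astype(np.float32)

inputs = feature_extractor(waveform, sampling_rate=48_000, return_tensors="pt")
# With the default padding=True, the output carries the padded waveforms plus
# a padding_mask; the time dimension is padded to a multiple of hop_length.
print(sorted(inputs.keys()))         # ['input_values', 'padding_mask']
print(inputs["input_values"].shape)  # batched, time padded to a multiple of 1920
```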