transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
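For readers who want to reproduce a summary like the list above, the sketch below compares two locally downloaded wheels using only the standard library. It is an approximation rather than the registry's own tooling: the wheel filenames and the pip download command in the comment are assumptions based on the versions named on this page, and the per-file counts can differ slightly from the numbers shown here (for example around renamed files).

# Rough sketch: per-file added/removed line counts between two wheels, assumed to be
# present locally (e.g. fetched with `pip download transformers==5.0.0rc1 --no-deps`
# and `pip download transformers==5.0.0rc2 --no-deps`).
import difflib
import zipfile

def python_files(wheel_path):
    with zipfile.ZipFile(wheel_path) as zf:
        return {
            name: zf.read(name).decode("utf-8", "replace").splitlines()
            for name in zf.namelist()
            if name.endswith(".py")
        }

old = python_files("transformers-5.0.0rc1-py3-none-any.whl")
new = python_files("transformers-5.0.0rc2-py3-none-any.whl")

for name in sorted(set(old) | set(new)):
    diff = list(difflib.unified_diff(old.get(name, []), new.get(name, []), lineterm=""))
    added = sum(1 for line in diff if line.startswith("+") and not line.startswith("+++"))
    removed = sum(1 for line in diff if line.startswith("-") and not line.startswith("---"))
    if added or removed:
        print(f"{name} +{added} -{removed}")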
transformers/models/__init__.py
CHANGED
@@ -23,6 +23,7 @@ if TYPE_CHECKING:
     from .albert import *
     from .align import *
     from .altclip import *
+    from .apertus import *
     from .arcee import *
     from .aria import *
     from .audio_spectrogram_transformer import *
@@ -107,6 +108,7 @@ if TYPE_CHECKING:
     from .dinov3_vit import *
     from .distilbert import *
     from .dit import *
+    from .doge import *
     from .donut import *
     from .dots1 import *
     from .dpr import *
@@ -119,7 +121,11 @@ if TYPE_CHECKING:
     from .emu3 import *
     from .encodec import *
     from .encoder_decoder import *
+    from .eomt import *
     from .ernie import *
+    from .ernie4_5 import *
+    from .ernie4_5_moe import *
+    from .ernie4_5_vl_moe import *
     from .esm import *
     from .evolla import *
     from .exaone4 import *
@@ -144,9 +150,11 @@ if TYPE_CHECKING:
     from .git import *
     from .glm import *
     from .glm4 import *
+    from .glm4_moe import *
     from .glm4v import *
     from .glm4v_moe import *
     from .glm46v import *
+    from .glmasr import *
     from .glpn import *
     from .got_ocr2 import *
     from .gpt2 import *
@@ -181,10 +189,12 @@ if TYPE_CHECKING:
     from .instructblip import *
     from .instructblipvideo import *
     from .internvl import *
+    from .jais2 import *
     from .jamba import *
     from .janus import *
     from .jetmoe import *
     from .kosmos2 import *
+    from .kosmos2_5 import *
     from .kyutai_speech_to_text import *
     from .lasr import *
     from .layoutlm import *
@@ -220,6 +230,7 @@ if TYPE_CHECKING:
     from .mbart50 import *
     from .megatron_bert import *
     from .megatron_gpt2 import *
+    from .metaclip_2 import *
     from .mgp_str import *
     from .mimi import *
     from .minimax import *
@@ -231,6 +242,7 @@ if TYPE_CHECKING:
     from .mlcd import *
     from .mllama import *
     from .mluke import *
+    from .mm_grounding_dino import *
     from .mobilebert import *
     from .mobilenet_v1 import *
     from .mobilenet_v2 import *
@@ -270,6 +282,9 @@ if TYPE_CHECKING:
     from .parakeet import *
     from .patchtsmixer import *
     from .patchtst import *
+    from .pe_audio import *
+    from .pe_audio_video import *
+    from .pe_video import *
     from .pegasus import *
     from .pegasus_x import *
     from .perceiver import *
@@ -281,6 +296,7 @@ if TYPE_CHECKING:
     from .phimoe import *
     from .phobert import *
     from .pix2struct import *
+    from .pixio import *
     from .pixtral import *
     from .plbart import *
     from .poolformer import *
@@ -317,8 +333,10 @@ if TYPE_CHECKING:
     from .sam import *
     from .sam2 import *
     from .sam2_video import *
+    from .sam3 import *
     from .sam3_tracker import *
     from .sam3_tracker_video import *
+    from .sam3_video import *
     from .sam_hq import *
     from .seamless_m4t import *
     from .seamless_m4t_v2 import *
@@ -330,6 +348,7 @@ if TYPE_CHECKING:
     from .shieldgemma2 import *
     from .siglip import *
     from .siglip2 import *
+    from .smollm3 import *
     from .smolvlm import *
     from .speech_encoder_decoder import *
     from .speech_to_text import *
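The import additions above register the model subpackages that are new, or newly exported, in this release. Assuming the rc2 wheel is installed, a quick sanity check is to import a few of them directly; the module names below are taken verbatim from the added lines.

# Assumes transformers 5.0.0rc2 is installed in the current environment.
import importlib

for name in ["ernie4_5_vl_moe", "glmasr", "jais2", "pe_audio", "pe_video", "pixio", "sam3_video"]:
    module = importlib.import_module(f"transformers.models.{name}")
    print(f"transformers.models.{name} -> {module.__name__}")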
transformers/models/afmoe/modeling_afmoe.py
CHANGED
@@ -25,11 +25,11 @@ from typing import Optional, Union
 import torch
 from torch import nn
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_func_from_hub, use_kernelized_func
-from ...integrations.hub_kernels import use_kernel_forward_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, MoeModelOutputWithPast
@@ -58,7 +58,7 @@ class AfmoeRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -531,20 +531,11 @@ class AfmoePreTrainedModel(PreTrainedModel):
 
     def _init_weights(self, module):
         """Initialize the weights"""
-
-
-
-                nn.init.zeros_(module.bias)
-        elif isinstance(module, nn.Embedding):
-            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
-            if module.padding_idx is not None:
-                nn.init.zeros_(module.weight[module.padding_idx])
-        elif isinstance(module, AfmoeRMSNorm):
-            nn.init.ones_(module.weight)
-        elif isinstance(module, AfmoeTokenChoiceRouter):
-            nn.init.zeros_(module.gate.weight)
+        super()._init_weights(module)
+        if isinstance(module, AfmoeTokenChoiceRouter):
+            init.zeros_(module.gate.weight)
         elif isinstance(module, AfmoeMoE):
-
+            init.zeros_(module.expert_bias)
 
 
 @auto_docstring
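The _init_weights hunk above follows a refactor that recurs across many of the files listed at the top (the modular afmoe file just below carries the same change): generic branches for nn.Linear, nn.Embedding, and norm layers are removed in favour of super()._init_weights(module), and only model-specific tensors are set through the helpers in the new transformers/initialization.py module. The torch-only sketch below illustrates what the two surviving branches do; the shapes and the use of torch.nn.init as a stand-in for the init helpers are assumptions made for illustration.

# Stand-ins for AfmoeTokenChoiceRouter.gate and AfmoeMoE.expert_bias with made-up shapes.
import torch
from torch import nn

gate = nn.Linear(64, 8, bias=False)           # router gate: zeroed so routing starts out uniform
expert_bias = nn.Parameter(torch.empty(8))    # per-expert load-balancing bias

nn.init.zeros_(gate.weight)                   # analogue of init.zeros_(module.gate.weight)
nn.init.zeros_(expert_bias)                   # analogue of init.zeros_(module.expert_bias)
print(gate.weight.abs().sum().item(), expert_bias.abs().sum().item())  # 0.0 0.0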
transformers/models/afmoe/modular_afmoe.py
CHANGED
@@ -20,6 +20,7 @@ from typing import Optional
 import torch
 from torch import nn
 
+from ... import initialization as init
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
@@ -350,20 +351,11 @@ class AfmoePreTrainedModel(PreTrainedModel):
 
     def _init_weights(self, module):
         """Initialize the weights"""
-
-
-
-                nn.init.zeros_(module.bias)
-        elif isinstance(module, nn.Embedding):
-            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
-            if module.padding_idx is not None:
-                nn.init.zeros_(module.weight[module.padding_idx])
-        elif isinstance(module, AfmoeRMSNorm):
-            nn.init.ones_(module.weight)
-        elif isinstance(module, AfmoeTokenChoiceRouter):
-            nn.init.zeros_(module.gate.weight)
+        super()._init_weights(module)
+        if isinstance(module, AfmoeTokenChoiceRouter):
+            init.zeros_(module.gate.weight)
        elif isinstance(module, AfmoeMoE):
-
+            init.zeros_(module.expert_bias)
 
 
 @auto_docstring
transformers/models/aimv2/modeling_aimv2.py
CHANGED
@@ -414,6 +414,10 @@ class Aimv2PreTrainedModel(PreTrainedModel):
             init.constant_(module.logit_scale, math.log(1 / 0.07))
         elif isinstance(module, Aimv2AttentionPoolingHead):
             init.normal_(module.cls_token, mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, Aimv2VisionEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+        elif isinstance(module, Aimv2TextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
 
 
 @auto_docstring(
transformers/models/aimv2/modular_aimv2.py
CHANGED
@@ -457,6 +457,10 @@ class Aimv2PreTrainedModel(PreTrainedModel):
             init.constant_(module.logit_scale, math.log(1 / 0.07))
         elif isinstance(module, Aimv2AttentionPoolingHead):
             init.normal_(module.cls_token, mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, Aimv2VisionEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+        elif isinstance(module, Aimv2TextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
 
 
 @auto_docstring(
transformers/models/albert/modeling_albert.py
CHANGED
@@ -320,6 +320,9 @@ class AlbertPreTrainedModel(PreTrainedModel):
             init.ones_(module.weight)
         elif isinstance(module, AlbertMLMHead):
             init.zeros_(module.bias)
+        elif isinstance(module, AlbertEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
 
 @dataclass
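The embedding branches added above, and repeated for several other models in this diff, give the non-persistent position_ids and token_type_ids buffers explicit values at initialization time instead of relying on values baked in at construction, which is likely needed when modules are first created on the meta device and materialized afterwards. In plain torch, the expression used for position_ids and the zeroed token_type_ids look like this:

import torch

seq_len = 12
position_ids = torch.arange(seq_len).expand((1, -1))        # shape (1, seq_len): [[0, 1, ..., 11]]
token_type_ids = torch.zeros(1, seq_len, dtype=torch.long)  # all-zero segment ids, as init.zeros_ produces
print(position_ids.shape, position_ids[0, :4].tolist(), token_type_ids.unique().tolist())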
transformers/models/align/modeling_align.py
CHANGED
@@ -781,9 +781,9 @@ class AlignTextEncoder(nn.Module):
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
             layer_outputs = layer_module(
-                hidden_states
-                attention_mask
-                output_attentions
+                hidden_states,
+                attention_mask,
+                output_attentions,
                 **kwargs,
             )
 
@@ -844,6 +844,13 @@ class AlignPreTrainedModel(PreTrainedModel):
         if isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
+        elif isinstance(module, AlignTextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
 
 @auto_docstring(
@@ -976,6 +983,8 @@ class AlignVisionModel(AlignPreTrainedModel):
     main_input_name = "pixel_values"
     input_modalities = ("image",)
     supports_gradient_checkpointing = False
+    _input_embed_layer = "convolution"
+    _no_split_modules = ["AlignVisionBlock"]
 
     def __init__(self, config: AlignVisionConfig):
         super().__init__(config)
@@ -994,9 +1003,6 @@ class AlignVisionModel(AlignPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
-    def get_input_embeddings(self) -> nn.Module:
-        return self.vision_model.embeddings.convolution
-
     @can_return_tuple
     @auto_docstring
     def forward(
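The extra BatchNorm handling above resets the layer's running statistics in addition to its affine parameters: running_mean, running_var, and num_batches_tracked are buffers rather than parameters, so they are not touched by the weight/bias initialization. The torch-only snippet below shows the equivalent reset on a stock nn.BatchNorm2d (for the statistics alone, torch's reset_running_stats() does the same job):

import torch
from torch import nn

bn = nn.BatchNorm2d(8)
nn.init.ones_(bn.weight)          # affine scale
nn.init.zeros_(bn.bias)           # affine shift
nn.init.zeros_(bn.running_mean)   # running statistics live in buffers
nn.init.ones_(bn.running_var)
bn.num_batches_tracked.zero_()
print(bn.running_mean.abs().sum().item(), bn.running_var.mean().item(), bn.num_batches_tracked.item())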
transformers/models/altclip/modeling_altclip.py
CHANGED
@@ -393,9 +393,9 @@ class AltRobertaEncoder(nn.Module):
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
             layer_outputs = layer_module(
-                hidden_states
-                attention_mask
-                output_attentions
+                hidden_states,
+                attention_mask,
+                output_attentions,
                 **kwargs,
             )
 
@@ -780,6 +780,7 @@ class AltCLIPPreTrainedModel(PreTrainedModel):
             init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
             init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
             init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+            init.copy_(module.position_ids, torch.arange(module.num_positions).expand((1, -1)))
         elif isinstance(module, AltCLIPAttention):
             factor = self.config.initializer_factor
             in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
@@ -815,6 +816,9 @@ class AltCLIPPreTrainedModel(PreTrainedModel):
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])
+        elif isinstance(module, AltRobertaEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
 
 class AltCLIPVisionTransformer(nn.Module):
transformers/models/apertus/modeling_apertus.py
CHANGED
@@ -25,7 +25,7 @@ from typing import Optional, Union
 import torch
 from torch import nn
 
-from ...activations import ACT2FN
+from ...activations import ACT2CLS, ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
 from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
@@ -49,6 +49,8 @@ class ApertusMLP(nn.Module):
         self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
         self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
         self.act_fn = ACT2FN[config.hidden_act]
+        if config.hidden_act == "xielu":
+            self.act_fn = ACT2CLS["xielu"](dtype=config.dtype)
 
     def forward(self, x):
         return self.down_proj(self.act_fn(self.up_proj(x)))
@@ -92,7 +94,7 @@ class ApertusRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
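The activation change above distinguishes stateless activations, which are looked up through ACT2FN, from xIELU, which is instantiated from ACT2CLS so it can be constructed with the model's dtype. A minimal sketch, assuming the rc2 wheel is installed and that the "xielu" entry accepts the dtype keyword exactly as it is called in the diff:

import torch
from transformers.activations import ACT2CLS, ACT2FN

silu = ACT2FN["silu"]                            # stateless lookup used for most hidden_act values
xielu = ACT2CLS["xielu"](dtype=torch.float32)    # class lookup, instantiated with an explicit dtype
print(type(silu).__name__, type(xielu).__name__)
print(xielu(torch.randn(2, 4)).shape)            # behaves like any other activation module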
transformers/models/apertus/modular_apertus.py
CHANGED
@@ -19,6 +19,7 @@ from typing import Optional
 import torch
 from torch import nn
 
+from ...activations import ACT2CLS
 from ...cache_utils import Cache
 from ...configuration_utils import PreTrainedConfig
 from ...modeling_rope_utils import RopeParameters
@@ -192,9 +193,11 @@ class ApertusConfig(PreTrainedConfig):
 
 class ApertusMLP(NemotronMLP):
     def __init__(self, config):
-        super().__init__()
+        super().__init__(config)
         self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
         self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        if config.hidden_act == "xielu":
+            self.act_fn = ACT2CLS["xielu"](dtype=config.dtype)
 
 
 class ApertusRMSNorm(LlamaRMSNorm):
transformers/models/arcee/modeling_arcee.py
CHANGED
@@ -99,7 +99,7 @@ class ArceeRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
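The one-line rotary-embedding change above is repeated across many models in this release: original_inv_freq becomes a non-persistent buffer instead of a plain attribute, so it follows the module through .to(device) and dtype casts while staying out of state_dict, meaning checkpoints neither store nor expect it. A self-contained torch sketch of the resulting behaviour (the class below is illustrative, not the library's implementation):

import torch
from torch import nn

class TinyRope(nn.Module):
    def __init__(self, dim=8, base=10000.0):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # rc2 style: kept alongside inv_freq, moves with the module, never serialized
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

rope = TinyRope()
print(list(rope.state_dict().keys()))   # [] -- non-persistent buffers are excluded from checkpoints
print(rope.original_inv_freq.shape)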
@@ -636,7 +636,7 @@ class AriaTextRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -1203,6 +1203,7 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         model_inputs = super().prepare_inputs_for_generation(
@@ -1212,12 +1213,15 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if
-            #
-            #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["pixel_mask"] = pixel_mask

@@ -1500,6 +1500,7 @@ class AriaForConditionalGeneration(LlavaForConditionalGeneration):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         model_inputs = super().prepare_inputs_for_generation(
@@ -1509,12 +1510,15 @@ class AriaForConditionalGeneration(LlavaForConditionalGeneration):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if
-            #
-            #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["pixel_mask"] = pixel_mask

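Both the standalone and the Llava-derived Aria classes now receive is_first_iteration from the generation loop and use it to decide whether the vision inputs still need to be forwarded. The decision itself reduces to a one-line predicate; a hedged sketch of the condition as written above:

    def should_pass_pixels(is_first_iteration: bool, use_cache: bool = True) -> bool:
        # pixels are needed on the first step (prefill, or continuation from a cached prompt),
        # or on every step when no KV cache is kept at all
        return is_first_iteration or not use_cache


    print(should_pass_pixels(True))           # True  -> first step, image features not yet cached
    print(should_pass_pixels(False))          # False -> decode steps reuse cached image features
    print(should_pass_pixels(False, False))   # True  -> caching disabled, recompute every step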
@@ -32,9 +32,6 @@ if is_torch_available():

 logger = logging.get_logger(__name__)

-MAX_AUDIO_LEN = 10 * 60  # 10 minutes
-DEFAULT_TRANSCRIPTION_PROMPT = "Transcribe the input speech."
-

 class AudioFlamingo3ProcessorKwargs(ProcessingKwargs, total=False):
     _defaults = {
@@ -63,32 +60,41 @@ class AudioFlamingo3Processor(ProcessorMixin):
     [`Qwen2TokenizerFast`]. See the [`~AudioFlamingo3Processor.__call__`] for more information.

     Args:
-
-
-
-
-
-
-
-
-
+        feature_extractor ([`WhisperFeatureExtractor`]):
+            The feature extractor is a required input.
+        tokenizer ([`Qwen2TokenizerFast`]):
+            The tokenizer is a required input.
+        chat_template (`Optional[str]`, *optional*):
+            The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
+            template will be used.
+        audio_token (`Optional[str]`, *optional*, defaults to `"<sound>"`):
+            Special token used to represent audio inputs in the chat template.
+        default_transcription_prompt (`str`, *optional*, defaults to `"Transcribe the input speech."`):
+            Default prompt to use for transcription tasks when applying transcription requests.
+        max_audio_len (`int`, *optional*, defaults to 600):
+            Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
     """

-    attributes = ["feature_extractor", "tokenizer"]
-    feature_extractor_class = "WhisperFeatureExtractor"
-    tokenizer_class = "Qwen2TokenizerFast"
-
     def __init__(
         self,
         feature_extractor,
         tokenizer,
         chat_template=None,
         audio_token="<sound>",
+        default_transcription_prompt="Transcribe the input speech.",
+        max_audio_len=600,
     ):
         self.audio_token = audio_token
         self.audio_token_id = tokenizer.convert_tokens_to_ids(audio_token)
+        self.default_transcription_prompt = default_transcription_prompt
+        self.max_audio_len = max_audio_len
         super().__init__(feature_extractor, tokenizer, chat_template=chat_template)

+    def _get_audio_token_length(self, audio_lengths: "torch.Tensor") -> "torch.Tensor":
+        conv_output_lengths = (audio_lengths - 1) // 2 + 1  # After conv2 downsampling
+        audio_tokens_lengths = (conv_output_lengths - 2) // 2 + 1  # After avg pooling
+        return audio_tokens_lengths
+
     def __call__(
         self,
         text: Union[TextInput, list[TextInput]],
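The new _get_audio_token_length helper centralizes the frame-to-token arithmetic (the "After conv2 downsampling" and "After avg pooling" halvings) so __call__ no longer carries it inline. A worked example of the same two formulas, with made-up frame counts:

    import torch


    def get_audio_token_length(audio_lengths: torch.Tensor) -> torch.Tensor:
        conv_output_lengths = (audio_lengths - 1) // 2 + 1  # after the stride-2 conv
        return (conv_output_lengths - 2) // 2 + 1           # after the average pooling


    print(get_audio_token_length(torch.tensor([3000, 1500, 100])))  # tensor([750, 375, 25])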
@@ -143,7 +149,7 @@ class AudioFlamingo3Processor(ProcessorMixin):

         # Determine number of chunks per sample, and flatten
         window_size = int(audio_kwargs["sampling_rate"] * audio_kwargs["chunk_length"])
-        max_windows = int(
+        max_windows = int(self.max_audio_len // audio_kwargs["chunk_length"])

         per_sample_windows: list[int] = []
         flat_chunks: list[np.ndarray] = []
@@ -153,7 +159,7 @@ class AudioFlamingo3Processor(ProcessorMixin):
             n_win = max(1, (n_samples + window_size - 1) // window_size)
             if n_win > max_windows:
                 logger.warning(
-                    f"Audio duration ({n_samples / audio_kwargs['sampling_rate']:.1f}s) exceeds {
+                    f"Audio duration ({n_samples / audio_kwargs['sampling_rate']:.1f}s) exceeds {self.max_audio_len}s; truncating to first {self.max_audio_len}s."
                 )
                 n_win = max_windows
             per_sample_windows.append(n_win)
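With the module-level MAX_AUDIO_LEN constant gone, the window budget is now derived from the instance's max_audio_len, and the truncation warning reports the same value. A quick check of the chunking arithmetic with assumed Whisper-style numbers (16 kHz sampling rate, 30 s chunks, the new 600 s default):

    sampling_rate, chunk_length, max_audio_len = 16_000, 30, 600

    window_size = int(sampling_rate * chunk_length)    # 480000 samples per chunk
    max_windows = int(max_audio_len // chunk_length)   # 20 chunks, i.e. 10 minutes

    n_samples = 45 * sampling_rate                     # a 45 s clip
    n_win = max(1, (n_samples + window_size - 1) // window_size)
    print(window_size, max_windows, min(n_win, max_windows))  # 480000 20 2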
@@ -171,8 +177,7 @@ class AudioFlamingo3Processor(ProcessorMixin):

         # Compute sequence lengths token counting
         audio_lengths = torch.stack([s.sum() for s in torch.split(padding_mask.sum(-1), per_sample_windows)])
-
-        audio_tokens_lengths = (conv_output_lengths - 2) // 2 + 1  # After avg pooling
+        audio_tokens_lengths = self._get_audio_token_length(audio_lengths)

         # expand audio tokens in text
         for i, audio_length in enumerate(audio_tokens_lengths):
@@ -236,7 +241,7 @@ class AudioFlamingo3Processor(ProcessorMixin):
             raise ValueError("`audio` must contain at least one sample.")

         if prompt is None:
-            prompts = [
+            prompts = [self.default_transcription_prompt] * batch_size
         elif isinstance(prompt, str):
             prompts = [prompt] * batch_size
         elif isinstance(prompt, (list, tuple)):
@@ -247,7 +252,7 @@ class AudioFlamingo3Processor(ProcessorMixin):
             prompts = []
             for item in prompt:
                 if item is None:
-                    prompts.append(
+                    prompts.append(self.default_transcription_prompt)
                 elif isinstance(item, str):
                     prompts.append(item)
                 else:
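Both prompt branches above now fall back to self.default_transcription_prompt instead of the removed module constant, so the default can be overridden per processor instance. A standalone sketch of the same normalization (hypothetical helper; the default string is taken from the new __init__ signature):

    def normalize_prompts(prompt, batch_size, default_prompt="Transcribe the input speech."):
        if prompt is None:
            return [default_prompt] * batch_size
        if isinstance(prompt, str):
            return [prompt] * batch_size
        return [default_prompt if item is None else item for item in prompt]


    print(normalize_prompts(None, 2))                  # two copies of the default prompt
    print(normalize_prompts(["Summarize.", None], 2))  # ['Summarize.', 'Transcribe the input speech.']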
@@ -543,7 +543,7 @@ def add_generation_mixin_to_remote_model(model_class):

 class _LazyAutoMapping(OrderedDict[type[PreTrainedConfig], _LazyAutoMappingValue]):
     """
-
+    A mapping config to object (model or tokenizer for instance) that will load keys and values when it is accessed.

     Args:
         - config_mapping: The map model type to config class
@@ -142,6 +142,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
         ("ernie", "ErnieConfig"),
         ("ernie4_5", "Ernie4_5Config"),
         ("ernie4_5_moe", "Ernie4_5_MoeConfig"),
+        ("ernie4_5_vl_moe", "Ernie4_5_VL_MoeConfig"),
         ("esm", "EsmConfig"),
         ("evolla", "EvollaConfig"),
         ("exaone4", "Exaone4Config"),
@@ -179,6 +180,8 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
         ("glm4v_moe_vision", "Glm4vMoeVisionConfig"),
         ("glm4v_text", "Glm4vTextConfig"),
         ("glm4v_vision", "Glm4vVisionConfig"),
+        ("glmasr", "GlmAsrConfig"),
+        ("glmasr_encoder", "GlmAsrEncoderConfig"),
         ("glpn", "GLPNConfig"),
         ("got_ocr2", "GotOcr2Config"),
         ("gpt-sw3", "GPT2Config"),
@@ -215,6 +218,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
         ("instructblipvideo", "InstructBlipVideoConfig"),
         ("internvl", "InternVLConfig"),
         ("internvl_vision", "InternVLVisionConfig"),
+        ("jais2", "Jais2Config"),
         ("jamba", "JambaConfig"),
         ("janus", "JanusConfig"),
         ("jetmoe", "JetMoeConfig"),
@@ -306,6 +310,12 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
         ("parakeet_encoder", "ParakeetEncoderConfig"),
         ("patchtsmixer", "PatchTSMixerConfig"),
         ("patchtst", "PatchTSTConfig"),
+        ("pe_audio", "PeAudioConfig"),
+        ("pe_audio_encoder", "PeAudioEncoderConfig"),
+        ("pe_audio_video", "PeAudioVideoConfig"),
+        ("pe_audio_video_encoder", "PeAudioVideoEncoderConfig"),
+        ("pe_video", "PeVideoConfig"),
+        ("pe_video_encoder", "PeVideoEncoderConfig"),
         ("pegasus", "PegasusConfig"),
         ("pegasus_x", "PegasusXConfig"),
         ("perceiver", "PerceiverConfig"),
@@ -316,6 +326,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
         ("phi4_multimodal", "Phi4MultimodalConfig"),
         ("phimoe", "PhimoeConfig"),
         ("pix2struct", "Pix2StructConfig"),
+        ("pixio", "PixioConfig"),
         ("pixtral", "PixtralVisionConfig"),
         ("plbart", "PLBartConfig"),
         ("poolformer", "PoolFormerConfig"),
@@ -582,6 +593,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
         ("ernie", "ERNIE"),
         ("ernie4_5", "Ernie4_5"),
         ("ernie4_5_moe", "Ernie4_5_MoE"),
+        ("ernie4_5_vl_moe", "Ernie4_5_VL_MoE"),
         ("esm", "ESM"),
         ("evolla", "Evolla"),
         ("exaone4", "EXAONE-4.0"),
@@ -622,6 +634,8 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
         ("glm4v_moe_vision", "Glm4vMoeVisionModel"),
         ("glm4v_text", "GLM4V"),
         ("glm4v_vision", "Glm4vVisionModel"),
+        ("glmasr", "GLM-ASR"),
+        ("glmasr_encoder", "GLM-ASR Encoder"),
         ("glpn", "GLPN"),
         ("got_ocr2", "GOT-OCR2"),
         ("gpt-sw3", "GPT-Sw3"),
@@ -659,6 +673,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
         ("instructblipvideo", "InstructBlipVideo"),
         ("internvl", "InternVL"),
         ("internvl_vision", "InternVLVision"),
+        ("jais2", "Jais2"),
         ("jamba", "Jamba"),
         ("janus", "Janus"),
         ("jetmoe", "JetMoe"),
@@ -762,6 +777,12 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
         ("parakeet_encoder", "ParakeetEncoder"),
         ("patchtsmixer", "PatchTSMixer"),
         ("patchtst", "PatchTST"),
+        ("pe_audio", "PeAudio"),
+        ("pe_audio_encoder", "PeAudioEncoder"),
+        ("pe_audio_video", "PeAudioVideo"),
+        ("pe_audio_video_encoder", "PeAudioVideoEncoder"),
+        ("pe_video", "PeVideo"),
+        ("pe_video_encoder", "PeVideoEncoder"),
         ("pegasus", "Pegasus"),
         ("pegasus_x", "PEGASUS-X"),
         ("perceiver", "Perceiver"),
@@ -773,6 +794,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
         ("phimoe", "Phimoe"),
         ("phobert", "PhoBERT"),
         ("pix2struct", "Pix2Struct"),
+        ("pixio", "Pixio"),
         ("pixtral", "Pixtral"),
         ("plbart", "PLBart"),
         ("poolformer", "PoolFormer"),
@@ -955,6 +977,7 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict[str, str](
         ("glm4v_moe_vision", "glm4v_moe"),
         ("glm4v_text", "glm4v"),
         ("glm4v_moe_text", "glm4v_moe"),
+        ("glmasr_encoder", "glmasr"),
         ("grounding-dino", "grounding_dino"),
         ("mm-grounding-dino", "mm_grounding_dino"),
         ("idefics3_vision", "idefics3"),
@@ -981,6 +1004,10 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict[str, str](
         ("llama4_text", "llama4"),
         ("blip_2_qformer", "blip_2"),
         ("fastspeech2_conformer_with_hifigan", "fastspeech2_conformer"),
+        ("perception_encoder", "perception_lm"),
+        ("pe_audio_encoder", "pe_audio"),
+        ("pe_video_encoder", "pe_video"),
+        ("pe_audio_video_encoder", "pe_audio_video"),
         ("video_llama_3_vision", "video_llama_3"),
         ("parakeet_encoder", "parakeet"),
         ("parakeet_ctc", "parakeet"),