transformers: 5.0.0rc1-py3-none-any.whl → 5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py

@@ -62,7 +62,11 @@ from ..qwen2_5_omni.modeling_qwen2_5_omni import (
     Qwen2_5OmniThinkerForConditionalGeneration,
     SnakeBeta,
 )
-from ..qwen2_5_omni.processing_qwen2_5_omni import
+from ..qwen2_5_omni.processing_qwen2_5_omni import (
+    Qwen2_5OmniProcessor,
+    Qwen2_5OmniProcessorKwargs,
+    SinusoidsPositionEmbedding,
+)
 from ..qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
 from ..qwen3.configuration_qwen3 import Qwen3Config
 from ..qwen3.modeling_qwen3 import (

@@ -91,6 +95,7 @@ from ..qwen3_vl_moe.modeling_qwen3_vl_moe import (
     Qwen3VLMoeTextRotaryEmbedding,
     Qwen3VLMoeVisionAttention,
     Qwen3VLMoeVisionModel,
+    Qwen3VLMoeVisionRotaryEmbedding,
 )


@@ -668,6 +673,7 @@ class Qwen3OmniMoeTalkerConfig(PreTrainedConfig):
         self.audio_start_token_id = audio_start_token_id
         self.vision_start_token_id = vision_start_token_id
         self.speaker_id = speaker_id
+        self.initializer_range = self.text_config.initializer_range
         super().__init__(**kwargs)


@@ -758,6 +764,7 @@ class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig):
         upsampling_ratios=(2, 2),
         decoder_dim=1536,
         attention_dropout=0.0,
+        initializer_range=0.02,
         **kwargs,
     ):
         self.codebook_size = codebook_size

@@ -777,6 +784,7 @@ class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig):
         self.upsampling_ratios = upsampling_ratios
         self.decoder_dim = decoder_dim
         self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
         self.rope_parameters = rope_parameters

         super().__init__(**kwargs)

@@ -865,6 +873,7 @@ class Qwen3OmniMoeConfig(PreTrainedConfig):
         self.thinker_config = Qwen3OmniMoeThinkerConfig(**thinker_config)
         self.talker_config = Qwen3OmniMoeTalkerConfig(**talker_config)
         self.code2wav_config = Qwen3OmniMoeCode2WavConfig(**code2wav_config)
+        self.initializer_range = self.thinker_config.initializer_range
         self.enable_audio_output = enable_audio_output
         self.im_start_token_id = im_start_token_id
         self.im_end_token_id = im_end_token_id

@@ -900,6 +909,19 @@ class Qwen3OmniMoePreTrainedModel(Qwen2_5OmniPreTrainedModel, PreTrainedModel):
             init.normal_(module.experts.gate_up_proj, mean=0.0, std=std)
             init.normal_(module.experts.down_proj, mean=0.0, std=std)
             init.normal_(module.gate.weight, mean=0.0, std=std)
+        elif isinstance(module, Qwen3OmniMoeCode2Wav):
+            init.copy_(
+                module.code_offset,
+                torch.arange(module.config.num_quantizers).view(1, -1, 1) * module.config.codebook_size,
+            )
+        elif isinstance(module, SinusoidsPositionEmbedding):
+            log_timescale_increment = np.log(module.max_timescale) / (module.channels // 2 - 1)
+            inv_timescales = torch.exp(-log_timescale_increment * torch.arange(module.channels // 2).float())
+            scaled_time = torch.arange(module.length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+            init.copy_(module.positional_embedding, torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1))
+        elif isinstance(module, Qwen3OmniMoeVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)


 class Qwen3OmniMoePreTrainedModelForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration):
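For readability, here is a minimal, standalone sketch of the sinusoidal table that the new `SinusoidsPositionEmbedding` branch of `_init_weights` writes into `positional_embedding` via `init.copy_`. The `length`, `channels`, and `max_timescale` names mirror the module attributes referenced in the hunk; the helper function itself is illustrative and not part of the package.

```python
import numpy as np
import torch

def sinusoid_table(length: int, channels: int, max_timescale: float = 10000.0) -> torch.Tensor:
    # Half of the channels carry sin terms and the other half cos terms, with
    # geometrically spaced timescales between 1 and max_timescale.
    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
    scaled_time = torch.arange(length)[:, None] * inv_timescales[None, :]
    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)  # shape (length, channels)

table = sinusoid_table(8, 4)
assert table.shape == (8, 4)
```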
@@ -1297,6 +1319,10 @@ class Qwen3OmniMoeVisionPatchMerger(nn.Module):
         return hidden


+class Qwen3OmniMoeVisionRotaryEmbedding(Qwen3VLMoeVisionRotaryEmbedding):
+    pass
+
+
 class Qwen3OmniMoeVisionEncoder(Qwen3VLMoeVisionModel):
     config: Qwen3OmniMoeVisionEncoderConfig
     _no_split_modules = ["Qwen3OmniMoeVisionBlock"]

@@ -1847,6 +1873,9 @@ class Qwen3OmniMoeTalkerModel(Qwen3VLMoeTextModel):


 class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3MoeForCausalLM):
+    _tied_weights_keys = {"codec_head": "model.codec_embedding.weight"}
+    _tp_plan = {"codec_head": "colwise_rep"}
+    _pp_plan = {"codec_head": (["hidden_states"], ["logits"])}
     config_class = Qwen3OmniMoeTalkerConfig
     base_model_prefix = "talker"
     _no_split_modules = ["Qwen3OmniMoeTalkerCodePredictorModelForConditionalGeneration"]
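The `_tied_weights_keys` entry added above declares that the talker's `codec_head` shares its parameters with `model.codec_embedding.weight`. A rough sketch of what such tying amounts to in plain PyTorch is shown below; the miniature module and its attribute names are invented for illustration, and in transformers the mapping is resolved during loading/saving rather than written out like this.

```python
import torch.nn as nn

class TinyTiedLM(nn.Module):
    # Hypothetical miniature model: the projection head and the embedding matrix
    # point at the same Parameter object, so they are saved and trained as one tensor.
    def __init__(self, vocab_size: int = 100, hidden: int = 16):
        super().__init__()
        self.codec_embedding = nn.Embedding(vocab_size, hidden)
        self.codec_head = nn.Linear(hidden, vocab_size, bias=False)
        self.codec_head.weight = self.codec_embedding.weight  # tie: one tensor, two views

model = TinyTiedLM()
assert model.codec_head.weight.data_ptr() == model.codec_embedding.weight.data_ptr()
```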
@@ -2033,18 +2062,31 @@ class Qwen3OmniMoeTalkerForConditionalGeneration(Qwen3MoeForCausalLM):
         return model_kwargs

     def prepare_inputs_for_generation(
-        self,
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        is_first_iteration=False,
+        **kwargs,
     ):
         hidden_states = kwargs.pop("hidden_states", None)
         inputs = super().prepare_inputs_for_generation(
-            input_ids,
+            input_ids,
+            past_key_values,
+            attention_mask,
+            inputs_embeds,
+            cache_position,
+            is_first_iteration=is_first_iteration,
+            **kwargs,
         )

         # Qwen3-Omni will prepare position ids in forward with deltas
         inputs["position_ids"] = None

         # TODO(raushan, gante): Refactor this part to a utility function
-        if
+        if not is_first_iteration and kwargs.get("use_cache", True):
             input_ids = input_ids[:, -1:]
             generation_step = kwargs.get("generation_step")
             trailing_text_hidden = kwargs.get("trailing_text_hidden")
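The reworked `prepare_inputs_for_generation` above only keeps the most recent token once a KV cache is in play. A hedged, self-contained sketch of that gating logic (not the actual transformers implementation) follows:

```python
import torch

def select_ids_for_step(input_ids: torch.Tensor, is_first_iteration: bool, use_cache: bool = True) -> torch.Tensor:
    # On the prefill step the full prompt is fed; on later steps, with a KV cache,
    # only the newly generated token needs to be run through the model.
    if not is_first_iteration and use_cache:
        return input_ids[:, -1:]
    return input_ids

prompt = torch.tensor([[101, 7, 42, 9]])
assert select_ids_for_step(prompt, is_first_iteration=True).shape == (1, 4)
assert select_ids_for_step(prompt, is_first_iteration=False).shape == (1, 1)
```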
@@ -2337,6 +2379,8 @@ class Qwen3OmniMoeCode2WavDecoderBlock(Qwen3OmniMoePreTrainedModel):

         self.block = nn.ModuleList(block)

+        self.post_init()
+
     def forward(self, hidden, **kwargs):
         for block in self.block:
             hidden = block(hidden)

transformers/models/qwen3_vl/configuration_qwen3_vl.py

@@ -110,7 +110,7 @@ class Qwen3VLTextConfig(PreTrainedConfig):
             Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
             a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
             with longer `max_position_embeddings`.
-        attention_bias (`bool`,
+        attention_bias (`bool`, *optional*, defaults to `False`):
             Whether to use a bias in the query, key, value and output projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.

@@ -197,13 +197,13 @@ class Qwen3VLConfig(PreTrainedConfig):
         vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLVisionConfig`):
             The config object or dictionary of the vision backbone.
         image_token_id (`int`, *optional*, defaults to 151655):
-            The
+            The token id used as the placeholder for image inputs.
         video_token_id (`int`, *optional*, defaults to 151656):
-            The
+            The token id used as the placeholder for video inputs.
         vision_start_token_id (`int`, *optional*, defaults to 151652):
-            The
+            The token id that marks the start of a vision segment (image or video).
         vision_end_token_id (`int`, *optional*, defaults to 151653):
-            The
+            The token id that marks the end of a vision segment (image or video).
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie the word embeddings.
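The docstrings fixed above describe placeholder ids. As a rough illustration, the default token values documented in the hunk mark where vision features will be spliced into a token sequence; the counting helper below is hypothetical and only demonstrates the layout.

```python
# Defaults documented above: vision_start 151652, vision_end 151653, image 151655.
VISION_START, VISION_END, IMAGE = 151652, 151653, 151655

def count_image_slots(input_ids: list[int]) -> int:
    # Each IMAGE placeholder marks one position whose embedding is later replaced
    # by a projected vision feature before the language model runs.
    return sum(1 for t in input_ids if t == IMAGE)

prompt = [1, 2, VISION_START, IMAGE, IMAGE, VISION_END, 3, 4]
assert count_image_slots(prompt) == 2
```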
transformers/models/qwen3_vl/modeling_qwen3_vl.py

@@ -27,6 +27,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin

@@ -81,6 +82,8 @@ class Qwen3VLVisionRotaryEmbedding(nn.Module):

     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
+        self.dim = dim
+        self.theta = theta
         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)

@@ -202,8 +205,8 @@ class Qwen3VLVisionAttention(nn.Module):
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

-            if self.config._attn_implementation
-                # Flash Attention
+            if "flash" in self.config._attn_implementation:
+                # Flash Attention: Use cu_seqlens for variable length attention
                 max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
                 attn_output, _ = attention_interface(
                     self,
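The branch rewritten above relies on `cu_seqlens` (cumulative sequence lengths) to run variable-length flash attention over packed vision patches. Below is a minimal sketch, assuming a plain list of per-image patch counts, of how such offsets relate to the `(cu_seqlens[1:] - cu_seqlens[:-1]).max()` expression in the diff; the helper is illustrative, not the library's own.

```python
import torch
import torch.nn.functional as F

def cu_seqlens_from_lengths(lengths: list[int]) -> torch.Tensor:
    # Prefix-sum boundaries [0, l0, l0+l1, ...]; flash-attention kernels expect int32 offsets.
    seqlens = torch.tensor(lengths, dtype=torch.int32)
    return F.pad(seqlens.cumsum(dim=0), (1, 0)).to(torch.int32)

lengths = [4, 7, 3]                       # e.g. patch counts of three packed images
cu_seqlens = cu_seqlens_from_lengths(lengths)
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
assert cu_seqlens.tolist() == [0, 4, 11, 14] and int(max_seqlen) == 7
```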
@@ -292,7 +295,7 @@ class Qwen3VLTextRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

         self.mrope_section = config.rope_parameters.get("mrope_section", [24, 20, 20])

@@ -592,6 +595,12 @@ class Qwen3VLPreTrainedModel(PreTrainedModel):
         "attentions": Qwen3VLTextAttention,
     }

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Qwen3VLVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+

 class Qwen3VLVisionModel(Qwen3VLPreTrainedModel):
     config: Qwen3VLVisionConfig
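The new `_init_weights` branch recomputes the rotary inverse frequencies from the module's stored `dim` and `theta`, which is why the constructor earlier in this diff now keeps them as attributes. The underlying formula, taken directly from the expression in the hunk and wrapped in a small standalone helper for clarity:

```python
import torch

def rope_inv_freq(dim: int, theta: float = 10000.0) -> torch.Tensor:
    # inv_freq[i] = 1 / theta**(2i / dim) for even channel indices 0, 2, ..., dim - 2,
    # matching the tensor copied into module.inv_freq above.
    return 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))

freqs = rope_inv_freq(8)
assert freqs.shape == (4,) and freqs[0] == 1.0 and freqs[-1] < freqs[0]
```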
@@ -632,6 +641,8 @@ class Qwen3VLVisionModel(Qwen3VLPreTrainedModel):

         self.gradient_checkpointing = False

+        self.post_init()
+
     def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
         merge_size = self.spatial_merge_size

@@ -1407,6 +1418,7 @@ class Qwen3VLForConditionalGeneration(Qwen3VLPreTrainedModel, GenerationMixin):
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

@@ -1423,6 +1435,7 @@ class Qwen3VLForConditionalGeneration(Qwen3VLPreTrainedModel, GenerationMixin):
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

@@ -1454,7 +1467,7 @@ class Qwen3VLForConditionalGeneration(Qwen3VLPreTrainedModel, GenerationMixin):
             text_positions = model_inputs["position_ids"][None, ...]
             model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0)

-        if
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
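The `not is_first_iteration and use_cache` condition in the last hunk drops the pixel tensors after the prefill pass, since the image and video features have already been folded into the KV cache. A toy illustration of that pruning of model inputs (the dictionary keys follow the diff; the function itself is only a sketch):

```python
def prune_vision_inputs(model_inputs: dict, is_first_iteration: bool, use_cache: bool) -> dict:
    # Once the prompt (including its images) has been processed into the KV cache,
    # later decode steps only need text tokens, so vision tensors are blanked out.
    if not is_first_iteration and use_cache:
        model_inputs["pixel_values"] = None
        model_inputs["pixel_values_videos"] = None
    return model_inputs

step0 = prune_vision_inputs({"pixel_values": "img", "pixel_values_videos": None}, True, True)
step1 = prune_vision_inputs({"pixel_values": "img", "pixel_values_videos": None}, False, True)
assert step0["pixel_values"] == "img" and step1["pixel_values"] is None
```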
transformers/models/qwen3_vl/modular_qwen3_vl.py

@@ -22,6 +22,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...configuration_utils import PreTrainedConfig

@@ -31,7 +32,7 @@ from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import BaseModelOutputWithPast
 from ...modeling_rope_utils import RopeParameters, dynamic_rope_update
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import ProcessingKwargs, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import auto_docstring, can_return_tuple, logging

@@ -151,7 +152,7 @@ class Qwen3VLTextConfig(PreTrainedConfig):
             Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
             a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
             with longer `max_position_embeddings`.
-        attention_bias (`bool`,
+        attention_bias (`bool`, *optional*, defaults to `False`):
             Whether to use a bias in the query, key, value and output projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.

@@ -238,13 +239,13 @@ class Qwen3VLConfig(PreTrainedConfig):
         vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLVisionConfig`):
             The config object or dictionary of the vision backbone.
         image_token_id (`int`, *optional*, defaults to 151655):
-            The
+            The token id used as the placeholder for image inputs.
         video_token_id (`int`, *optional*, defaults to 151656):
-            The
+            The token id used as the placeholder for video inputs.
         vision_start_token_id (`int`, *optional*, defaults to 151652):
-            The
+            The token id that marks the start of a vision segment (image or video).
         vision_end_token_id (`int`, *optional*, defaults to 151653):
-            The
+            The token id that marks the end of a vision segment (image or video).
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie the word embeddings.

@@ -488,6 +489,12 @@ class Qwen3VLPreTrainedModel(Qwen2VLPreTrainedModel):
         "attentions": Qwen3VLTextAttention,
     }

+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, Qwen3VLVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+

 class Qwen3VLVisionModel(Qwen3VLPreTrainedModel):
     config: Qwen3VLVisionConfig

@@ -528,6 +535,8 @@ class Qwen3VLVisionModel(Qwen3VLPreTrainedModel):

         self.gradient_checkpointing = False

+        self.post_init()
+
     def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
         merge_size = self.spatial_merge_size

@@ -1190,6 +1199,7 @@ class Qwen3VLForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

@@ -1206,6 +1216,7 @@ class Qwen3VLForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

@@ -1237,7 +1248,7 @@ class Qwen3VLForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
             text_positions = model_inputs["position_ids"][None, ...]
             model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0)

-        if
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None

@@ -1393,9 +1404,9 @@ class Qwen3VLProcessor(Qwen2VLProcessor):
         **kwargs: Unpack[Qwen3VLProcessorKwargs],
     ) -> BatchFeature:
         """
-        Main method to prepare for the model one or several
+        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
         and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
-        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `
+        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
         Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

         Args:

@@ -1407,7 +1418,7 @@ class Qwen3VLProcessor(Qwen2VLProcessor):
             (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
             `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
         videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
-            The
+            The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
             tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
         return_tensors (`str` or [`~utils.TensorType`], *optional*):
             If set, will return tensors of a particular framework. Acceptable values are:

transformers/models/qwen3_vl/processing_qwen3_vl.py

@@ -99,9 +99,9 @@ class Qwen3VLProcessor(ProcessorMixin):
         **kwargs: Unpack[Qwen3VLProcessorKwargs],
     ) -> BatchFeature:
         """
-        Main method to prepare for the model one or several
+        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
         and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
-        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `
+        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
         Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

         Args:

@@ -113,7 +113,7 @@ class Qwen3VLProcessor(ProcessorMixin):
             (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
             `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
         videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
-            The
+            The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
             tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
         return_tensors (`str` or [`~utils.TensorType`], *optional*):
             If set, will return tensors of a particular framework. Acceptable values are:
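The docstring fixes above describe the processor's `__call__` contract. For context, a hedged usage sketch follows; the checkpoint id is a placeholder rather than a verified model name, and the exact output keys depend on the model.

```python
from PIL import Image
from transformers import AutoProcessor

# "<org>/<qwen3-vl-checkpoint>" is a placeholder, not a real repository id.
processor = AutoProcessor.from_pretrained("<org>/<qwen3-vl-checkpoint>")
image = Image.new("RGB", (224, 224))

# Text and image are prepared together into a single BatchFeature of tensors.
inputs = processor(text="Describe this image.", images=image, return_tensors="pt")
print(sorted(inputs.keys()))  # e.g. attention_mask, image_grid_thw, input_ids, pixel_values
```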