transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/imagegpt/modeling_imagegpt.py

@@ -38,6 +38,7 @@ from ...utils import (
     logging,
     torch_float,
 )
+from ...utils.generic import maybe_autocast
 from .configuration_imagegpt import ImageGPTConfig
 
 
@@ -60,7 +61,7 @@ class ImageGPTLayerNorm(nn.Module):
 class ImageGPTAttention(nn.Module):
     def __init__(self, config, is_cross_attention: Optional[bool] = False, layer_idx: Optional[int] = None):
         super().__init__()
-
+        self.config = config
         max_positions = config.max_position_embeddings
         self.register_buffer(
             "bias",
@@ -69,7 +70,6 @@ class ImageGPTAttention(nn.Module):
             ),
             persistent=False,
         )
-        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
 
         self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
@@ -150,7 +150,7 @@ class ImageGPTAttention(nn.Module):
             scale_factor /= float(self.layer_idx + 1)
 
         # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk))
-        with torch.amp.autocast(query.device.type, enabled=False):
+        with maybe_autocast(query.device.type, enabled=False):
            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
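The new import, maybe_autocast, comes from transformers/utils/generic.py, whose implementation is not part of this diff. A minimal sketch of what such a helper could look like, assuming it merely defers to torch.autocast and falls back to a no-op context for device types that do not support autocast (the real helper may differ):

from contextlib import nullcontext

import torch


def maybe_autocast(device_type: str, **kwargs):
    # Hypothetical sketch: build a torch.autocast context for the given device
    # type; if the device type is unsupported, return a no-op context instead.
    try:
        return torch.autocast(device_type, **kwargs)
    except RuntimeError:
        return nullcontext()


# Usage mirroring the hunk above:
# with maybe_autocast(query.device.type, enabled=False):
#     ...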
@@ -383,6 +383,14 @@ class ImageGPTPreTrainedModel(PreTrainedModel):
                 if "c_proj" in name and "weight" in name:
                     # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                     init.normal_(p, mean=0.0, std=self.config.initializer_range / math.sqrt(2 * self.config.n_layer))
+        elif isinstance(module, ImageGPTAttention):
+            max_positions = module.config.max_position_embeddings
+            init.copy_(
+                module.bias,
+                torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
+                    1, 1, max_positions, max_positions
+                ),
+            )
 
 
 @auto_docstring
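The "bias" buffer is registered with persistent=False, so it is absent from checkpoints and _init_weights now rebuilds the causal mask explicitly (which also explains why self.config is kept on the attention module above). A small demonstration of the mask that init.copy_ writes, using a hypothetical max_positions = 4:

import torch

max_positions = 4  # illustrative only; the model uses config.max_position_embeddings
mask = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
    1, 1, max_positions, max_positions
)
print(mask[0, 0].int())
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 1]], dtype=torch.int32)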
transformers/models/informer/modeling_informer.py

@@ -879,6 +879,7 @@ class InformerEncoder(InformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         Args:
@@ -998,6 +999,7 @@ class InformerDecoder(InformerPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         r"""
         Args:
@@ -1296,6 +1298,7 @@ class InformerModel(InformerPreTrainedModel):
         use_cache: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[Seq2SeqTSModelOutput, tuple]:
         r"""
         past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`):
@@ -1573,6 +1576,7 @@ class InformerForPrediction(InformerPreTrainedModel):
         use_cache: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[Seq2SeqTSModelOutput, tuple]:
         r"""
         past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`):
transformers/models/informer/modular_informer.py

@@ -415,6 +415,7 @@ class InformerEncoder(TimeSeriesTransformerEncoder):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         Args:
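Each of these Informer forward signatures gains a trailing **kwargs, so shared call sites can pass extra keyword arguments through the encoder/decoder stack without a TypeError. A minimal illustration of the pattern (names here are hypothetical, not the library's):

from typing import Optional


def forward(output_attentions: Optional[bool] = None, **kwargs) -> None:
    # Extra keyword arguments are absorbed instead of raising a TypeError.
    ...


forward(output_attentions=True, some_new_flag=False)  # OK: extra kwarg absorbed
# Without **kwargs the same call would raise:
# TypeError: forward() got an unexpected keyword argument 'some_new_flag'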
transformers/models/instructblip/modeling_instructblip.py

@@ -335,6 +335,8 @@ class InstructBlipPreTrainedModel(PreTrainedModel):
             init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
         elif isinstance(module, (InstructBlipForConditionalGeneration, InstructBlipModel)):
             init.zeros_(module.query_tokens)
+        elif isinstance(module, InstructBlipQFormerEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
 
 
 # Copied from transformers.models.blip.modeling_blip.BlipEncoder with Blip->InstructBlip
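As with the ImageGPT mask above, position_ids is a non-persistent buffer, so _init_weights re-creates it rather than loading it from the checkpoint. What the copied tensor looks like, for a hypothetical buffer length of 6:

import torch

print(torch.arange(6).expand((1, -1)))
# tensor([[0, 1, 2, 3, 4, 5]])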
@@ -128,6 +128,56 @@ class InstructBlipVideoVisionEmbeddings(nn.Module):
         return embeddings


+class InstructBlipVideoQFormerEmbeddings(nn.Module):
+    """Construct the embeddings from word and position embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+
+        self.config = config
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        query_embeds=None,
+        past_key_values_length=0,
+    ):
+        if input_ids is not None:
+            seq_length = input_ids.size()[1]
+        else:
+            seq_length = 0
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()
+
+        if input_ids is not None:
+            embeddings = self.word_embeddings(input_ids)
+
+            position_embeddings = self.position_embeddings(position_ids.to(embeddings.device))
+            embeddings = embeddings + position_embeddings
+
+            if query_embeds is not None:
+                embeddings = torch.cat((query_embeds, embeddings), dim=1)
+        else:
+            embeddings = query_embeds
+
+        embeddings = embeddings.to(self.layernorm.weight.dtype)
+        embeddings = self.layernorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
 @auto_docstring
 class InstructBlipVideoPreTrainedModel(PreTrainedModel):
     config: InstructBlipVideoConfig
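The class body is unchanged from its old location later in the file (removed in a hunk below); it is hoisted above `InstructBlipVideoPreTrainedModel` so `_init_weights` can reference it. A minimal usage sketch, assuming the class definition above is in scope; the `SimpleNamespace` config is an illustrative stand-in for the real Q-Former config:

from types import SimpleNamespace

import torch

config = SimpleNamespace(
    vocab_size=100, hidden_size=16, pad_token_id=0,
    max_position_embeddings=32, layer_norm_eps=1e-12, hidden_dropout_prob=0.0,
)
emb = InstructBlipVideoQFormerEmbeddings(config)

input_ids = torch.tensor([[5, 7, 9]])  # (batch=1, seq=3)
query_embeds = torch.randn(1, 4, 16)   # 4 learned query tokens
out = emb(input_ids=input_ids, query_embeds=query_embeds)
print(out.shape)  # torch.Size([1, 7, 16]) -- query tokens prepended to the text embeddings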
@@ -158,6 +208,8 @@ class InstructBlipVideoPreTrainedModel(PreTrainedModel):
             init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
         elif isinstance(module, (InstructBlipVideoForConditionalGeneration, InstructBlipVideoModel)):
             init.zeros_(module.query_tokens)
+        elif isinstance(module, InstructBlipVideoQFormerEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


 # Adapted from transformers.models.siglip.modeling_siglip.eager_attention_forward -> InstructBlipVideo doesn't cast attn weights to fp32
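Both `_init_weights` additions (here and in the InstructBlip hunk above) target the non-persistent `position_ids` buffer: because it is excluded from the state dict, it is never restored from a checkpoint and has to be rebuilt at initialization time (for instance when the model is first materialized from meta tensors). A toy reproduction of the pattern; `ToyEmbeddings` is illustrative, and the diff's `init.copy_` helper is approximated here with `Tensor.copy_`:

import torch
from torch import nn


class ToyEmbeddings(nn.Module):
    """Illustrative stand-in for the Q-Former embeddings and their position_ids buffer."""

    def __init__(self, max_position_embeddings: int = 8):
        super().__init__()
        # persistent=False: excluded from state_dict, so checkpoints never restore it
        self.register_buffer(
            "position_ids", torch.arange(max_position_embeddings).expand((1, -1)), persistent=False
        )


toy = ToyEmbeddings()
# What the new `_init_weights` branch does: write the arange values back into the buffer
with torch.no_grad():
    toy.position_ids.copy_(torch.arange(toy.position_ids.shape[-1]).expand((1, -1)))
print(toy.position_ids)  # tensor([[0, 1, 2, 3, 4, 5, 6, 7]])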
@@ -677,56 +729,6 @@ class InstructBlipVideoQFormerEncoder(nn.Module):
         )


-class InstructBlipVideoQFormerEmbeddings(nn.Module):
-    """Construct the embeddings from word and position embeddings."""
-
-    def __init__(self, config):
-        super().__init__()
-        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
-        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
-
-        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-
-        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer(
-            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
-        )
-
-        self.config = config
-
-    def forward(
-        self,
-        input_ids=None,
-        position_ids=None,
-        query_embeds=None,
-        past_key_values_length=0,
-    ):
-        if input_ids is not None:
-            seq_length = input_ids.size()[1]
-        else:
-            seq_length = 0
-
-        if position_ids is None:
-            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()
-
-        if input_ids is not None:
-            embeddings = self.word_embeddings(input_ids)
-
-            position_embeddings = self.position_embeddings(position_ids.to(embeddings.device))
-            embeddings = embeddings + position_embeddings
-
-            if query_embeds is not None:
-                embeddings = torch.cat((query_embeds, embeddings), dim=1)
-        else:
-            embeddings = query_embeds
-
-        embeddings = embeddings.to(self.layernorm.weight.dtype)
-        embeddings = self.layernorm(embeddings)
-        embeddings = self.dropout(embeddings)
-        return embeddings
-
-
 class InstructBlipVideoQFormerModel(InstructBlipVideoPreTrainedModel):
     """
     Querying Transformer (Q-Former), used in InstructBlipVideo. Slightly modified from BLIP-2 as it also takes the
@@ -84,7 +84,6 @@ class InstructBlipVideoVideoProcessor(BaseVideoProcessor):
             processed_videos_grouped[shape] = stacked_videos

         processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
-        processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos

         return BatchFeature(data={"pixel_values": processed_videos}, tensor_type=return_tensors)
@@ -208,11 +208,10 @@ class InternVLVisionPatchEmbeddings(nn.Module):
                 "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
             )

-        embeddings = self.projection(pixel_values)
-        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
+        embeddings = self.projection(pixel_values.to(self.projection.weight.dtype))
         embeddings = embeddings.flatten(2).transpose(1, 2)

-        return embeddings, (patch_height, patch_width)
+        return embeddings


 # Based on timm implementation, which can be found here:
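Two changes land here: pixel values are cast to the projection weight's dtype (so fp16/bf16 checkpoints accept fp32 inputs), and the `(patch_height, patch_width)` tuple is dropped from the return value since the caller discards it (see the `InternVLVisionEmbeddings` hunks below). A shape walk-through with illustrative sizes; `projection` is a stand-in for `self.projection`:

import torch
from torch import nn

projection = nn.Conv2d(3, 32, kernel_size=14, stride=14)  # stand-in for self.projection
pixel_values = torch.randn(1, 3, 224, 224)                # 224x224 image, patch size 14

embeddings = projection(pixel_values.to(projection.weight.dtype))  # (1, 32, 16, 16)
embeddings = embeddings.flatten(2).transpose(1, 2)                 # (1, 256, 32): one row per patch
print(embeddings.shape)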
@@ -291,7 +290,7 @@ class InternVLVisionEmbeddings(nn.Module):
         bool_masked_pos: Optional[torch.BoolTensor] = None,
     ) -> torch.Tensor:
         _, _, height, width = pixel_values.shape
-        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
+        embeddings = self.patch_embeddings(pixel_values)
         batch_size, seq_len, _ = embeddings.size()

         if bool_masked_pos is not None:
@@ -308,7 +307,7 @@ class InternVLVisionEmbeddings(nn.Module):

         embeddings = self.dropout(embeddings)

-        return embeddings, (patch_height, patch_width)
+        return embeddings


 class InternVLVisionMLP(nn.Module):
@@ -449,15 +448,13 @@ class InternVLVisionModel(InternVLVisionPreTrainedModel):
     @check_model_inputs(tie_last_hidden_states=False)
     @auto_docstring
     def forward(
-        self,
-        pixel_values: torch.Tensor,
-        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None, **kwargs
     ) -> Union[tuple, InternVLVisionModelOutputWithPooling]:
         r"""
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
             Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
         """
-        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+        embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

         encoder_outputs = self.encoder(embedding_output)
         sequence_output = encoder_outputs[0]
@@ -900,6 +897,7 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin)
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -911,12 +909,15 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin)
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available.
+            # In subsequent iterations, they are already merged with text and cached.
+            # NOTE: the first iteration doesn't have to be prefill; it can be the first
+            # iteration with a question and a cached system prompt (continuing generation from cache)
             model_inputs["pixel_values"] = pixel_values

         return model_inputs
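The new condition replaces the `cache_position[0] == 0` prefill check: when generation continues from a pre-filled cache (say, a cached system prompt), the first step is not at position 0, yet pixel values still need to be forwarded exactly once. The gating logic restated standalone; the function name is illustrative, not a transformers API:

def should_forward_pixel_values(is_first_iteration: bool, use_cache: bool = True) -> bool:
    # Forward vision inputs only while they are not yet merged into the KV cache:
    # on the first iteration, or on every step when caching is disabled.
    return is_first_iteration or not use_cache


assert should_forward_pixel_values(True)                    # prefill / first step
assert not should_forward_pixel_values(False)               # cached decoding steps
assert should_forward_pixel_values(False, use_cache=False)  # no cache: always forward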
@@ -29,7 +29,7 @@ from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_int
 from ...utils.generic import check_model_inputs
 from ..clip.modeling_clip import CLIPMLP
 from ..janus.modeling_janus import JanusVisionAttention
@@ -44,9 +44,6 @@ from ..llava.modeling_llava import (
 from .configuration_internvl import InternVLConfig, InternVLVisionConfig


-logger = logging.get_logger(__name__)
-
-
 def eager_attention_forward(
     module: nn.Module,
     query: torch.Tensor,
@@ -176,11 +173,10 @@ class InternVLVisionPatchEmbeddings(nn.Module):
                 "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
             )

-        embeddings = self.projection(pixel_values)
-        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
+        embeddings = self.projection(pixel_values.to(self.projection.weight.dtype))
         embeddings = embeddings.flatten(2).transpose(1, 2)

-        return embeddings, (patch_height, patch_width)
+        return embeddings


 # Based on timm implementation, which can be found here:
@@ -259,7 +255,7 @@ class InternVLVisionEmbeddings(nn.Module):
         bool_masked_pos: Optional[torch.BoolTensor] = None,
     ) -> torch.Tensor:
         _, _, height, width = pixel_values.shape
-        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
+        embeddings = self.patch_embeddings(pixel_values)
         batch_size, seq_len, _ = embeddings.size()

         if bool_masked_pos is not None:
@@ -276,7 +272,7 @@ class InternVLVisionEmbeddings(nn.Module):

         embeddings = self.dropout(embeddings)

-        return embeddings, (patch_height, patch_width)
+        return embeddings


 class InternVLVisionMLP(CLIPMLP):
@@ -406,15 +402,13 @@ class InternVLVisionModel(InternVLVisionPreTrainedModel):
     @check_model_inputs(tie_last_hidden_states=False)
     @auto_docstring
     def forward(
-        self,
-        pixel_values: torch.Tensor,
-        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None, **kwargs
     ) -> Union[tuple, InternVLVisionModelOutputWithPooling]:
         r"""
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
             Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
         """
-        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+        embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

         encoder_outputs = self.encoder(embedding_output)
         sequence_output = encoder_outputs[0]
@@ -140,7 +140,6 @@ class InternVLVideoProcessor(BaseVideoProcessor):
             processed_videos_grouped[shape] = stacked_videos

         processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
-        processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos

         return BatchFeature(data={"pixel_values_videos": processed_videos}, tensor_type=return_tensors)
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_jais2 import *
+    from .modeling_jais2 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
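The new package `__init__` follows the library-wide lazy-import pattern: under `TYPE_CHECKING` the star imports give static analyzers full visibility, while at runtime the module object is replaced by a `_LazyModule` that resolves attributes on first access. A minimal self-contained illustration of the idea (this is not transformers' actual `_LazyModule`):

import importlib
import sys
import types


class MiniLazyModule(types.ModuleType):
    """Toy lazy module: maps attribute names to submodules and imports on demand."""

    def __init__(self, name: str, attr_to_submodule: dict[str, str]):
        super().__init__(name)
        self._attr_to_submodule = attr_to_submodule

    def __getattr__(self, item: str):
        if item in self._attr_to_submodule:
            submodule = importlib.import_module(self._attr_to_submodule[item], package=self.__name__)
            value = getattr(submodule, item)
            setattr(self, item, value)  # cache so __getattr__ is not hit again
            return value
        raise AttributeError(f"module {self.__name__!r} has no attribute {item!r}")


# A real package __init__ would then do, analogously to the file above:
# sys.modules[__name__] = MiniLazyModule(__name__, {"Jais2Config": ".configuration_jais2"})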
@@ -0,0 +1,152 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/jais2/modular_jais2.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_jais2.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from ...configuration_utils import PreTrainedConfig
+from ...modeling_rope_utils import RopeParameters
+
+
+class Jais2Config(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Jais2Model`]. It is used to instantiate a Jais2
+    model according to the specified arguments, defining the model architecture. Instantiating with the defaults yields
+    a configuration similar to [inceptionai/Jais-2-8B-Chat](https://huggingface.co/inceptionai/Jais-2-8B-Chat).
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 150272):
+            Vocabulary size of the Jais2 model.
+        hidden_size (`int`, *optional*, defaults to 3328):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 26624):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 26):
+            Number of attention heads for each attention layer.
+        num_key_value_heads (`int`, *optional*):
+            Number of key_value heads for Grouped Query Attention.
+        hidden_act (`str`, *optional*, defaults to `"relu2"`):
+            The non-linear activation function in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether to return the last key/values attentions.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 150024):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings.
+        attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use a bias in the query, key, value and output projection layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use a bias in the up_proj, down_proj and gate_proj layers.
+        head_dim (`int`, *optional*):
+            The attention head dimension.
+        rope_parameters (`dict`, *optional*):
+            The RoPE parameters.
+    """
+
+    model_type = "jais2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size: Optional[int] = 150272,
+        hidden_size: Optional[int] = 3328,
+        intermediate_size: Optional[int] = 26624,
+        num_hidden_layers: Optional[int] = 32,
+        num_attention_heads: Optional[int] = 26,
+        num_key_value_heads: Optional[int] = None,
+        hidden_act: Optional[str] = "relu2",
+        max_position_embeddings: Optional[int] = 8192,
+        initializer_range: Optional[float] = 0.02,
+        layer_norm_eps: Optional[float] = 1e-5,
+        use_cache: Optional[bool] = True,
+        pad_token_id: Optional[int] = None,
+        bos_token_id: Optional[int] = 0,
+        eos_token_id: Optional[int] = 150024,
+        tie_word_embeddings: Optional[bool] = False,
+        attention_bias: Optional[bool] = True,
+        attention_dropout: Optional[float] = 0.0,
+        mlp_bias: Optional[bool] = True,
+        head_dim: Optional[int] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
+        self.rope_parameters = rope_parameters
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.layer_norm_eps = layer_norm_eps
+
+
+__all__ = ["Jais2Config"]
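A short usage sketch; it assumes `Jais2Config` is re-exported at the transformers top level like other model configs (the `transformers/__init__.py` change in this release suggests so, but the export itself is not shown here):

from transformers import Jais2Config  # assumption: exported like other configs

config = Jais2Config()  # Jais-2-8B-Chat-like defaults, per the docstring above
assert config.head_dim == config.hidden_size // config.num_attention_heads  # 3328 // 26 = 128
assert config.num_key_value_heads == config.num_attention_heads  # falls back to MHA when unset

# An illustrative tiny variant, e.g. for unit tests:
tiny = Jais2Config(hidden_size=128, intermediate_size=256, num_hidden_layers=2, num_attention_heads=4)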