transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/herbert/tokenization_herbert.py

@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import Optional, Union
 
 from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
 from tokenizers.models import BPE
@@ -54,19 +54,20 @@ class HerbertTokenizer(TokenizersBackend):
             The mask token.
         sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token.
-        vocab (`dict`, *optional*):
+        vocab (`str`, `dict` or `list`, *optional*):
             Custom vocabulary dictionary.
-        merges (`list`, *optional*):
+        merges (`str` or `list[str]`, *optional*):
            Custom merges list.
    """

    vocab_files_names = VOCAB_FILES_NAMES
-
+    model_input_names = ["input_ids", "attention_mask"]
+    model = BPE

    def __init__(
        self,
-        vocab: Optional[dict] = None,
-        merges: Optional[list] = None,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
+        merges: Optional[Union[str, list[str]]] = None,
        cls_token: str = "<s>",
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
@@ -76,19 +77,8 @@ class HerbertTokenizer(TokenizersBackend):
        merges_file: Optional[str] = None,
        **kwargs,
    ):
-        if vocab is not None:
-            self._vocab = (
-                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
-            )
-        else:
-            self._vocab = {}
-
-        if merges is not None:
-            # Convert lists to tuples if necessary (happens when loading from JSON)
-            self._merges = [tuple(merge) if isinstance(merge, list) else merge for merge in merges]
-        else:
-            self._merges = []
-
+        self._vocab = vocab if vocab is not None else {str(unk_token): 0}
+        self._merges = merges or []
        self._tokenizer = Tokenizer(
            BPE(
                vocab=self._vocab,
@@ -105,13 +95,7 @@ class HerbertTokenizer(TokenizersBackend):
        self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
        self._tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")

-        tokenizer_object = self._tokenizer
-
-        self.vocab_file = vocab_file
-        self.merges_file = merges_file
-
        super().__init__(
-            tokenizer_object=tokenizer_object,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
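The HerbertTokenizer change collapses the old list-coercion logic into a pass-through with an `{unk_token: 0}` fallback, and drops the manual `tokenizer_object` plumbing now handled by `TokenizersBackend`. A standalone sketch of the before/after vocab handling, with illustrative helper names that are not part of the transformers API:

```python
# Illustrative comparison of the removed and added vocab handling above.
# vocab_rc0/vocab_rc2 are hypothetical names for this sketch only.

def vocab_rc0(vocab):
    # 5.0.0rc0: also accepted a [(token, score), ...] list and coerced it
    # into a {token: index} dict; fell back to an empty dict.
    if vocab is not None:
        return {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
    return {}

def vocab_rc2(vocab, unk_token="<unk>"):
    # 5.0.0rc2: passes the value through untouched and seeds an otherwise
    # empty vocabulary with the unknown token so the BPE model stays valid.
    return vocab if vocab is not None else {str(unk_token): 0}

assert vocab_rc0([("hello", -1.0), ("world", -2.0)]) == {"hello": 0, "world": 1}
assert vocab_rc2(None) == {"<unk>": 0}
assert vocab_rc2({"hello": 0}) == {"hello": 0}
```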
transformers/models/hgnet_v2/modeling_hgnet_v2.py

@@ -26,6 +26,7 @@ import torch
 import torch.nn.functional as F
 from torch import Tensor, nn
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...modeling_outputs import BackboneOutput, BaseModelOutputWithNoAttention, ImageClassifierOutputWithNoAttention
 from ...modeling_utils import PreTrainedModel
@@ -45,6 +46,15 @@ class HGNetV2PreTrainedModel(PreTrainedModel):
     input_modalities = ("image",)
     _no_split_modules = ["HGNetV2BasicLayer"]
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        # We need to check it like that as d_fine models replace the BatchNorm2d by their own
+        if "BatchNorm" in module.__class__.__name__:
+            init.ones_(module.weight)
+            init.zeros_(module.bias)
+            init.zeros_(module.running_mean)
+            init.ones_(module.running_var)
+
 
 class HGNetV2LearnableAffineBlock(nn.Module):
     def __init__(self, scale_value: float = 1.0, bias_value: float = 0.0):
@@ -347,7 +357,11 @@ class HGNetV2Backbone(HGNetV2PreTrainedModel, BackboneMixin):
 
     @auto_docstring
     def forward(
-        self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
+        self,
+        pixel_values: Tensor,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> BackboneOutput:
         r"""
         Examples:
@@ -426,6 +440,7 @@ class HGNetV2ForImageClassification(HGNetV2PreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> ImageClassifierOutputWithNoAttention:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
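The comment in the new `_init_weights` explains the string-based check: d_fine swaps `nn.BatchNorm2d` for its own class, which an `isinstance` test would not necessarily match, while a class-name test still does. A small sketch of the difference; `FrozenBatchNorm2d` here is a hypothetical stand-in for the replacement class the comment refers to:

```python
# Sketch of why the name check above is broader than an isinstance check.
import torch
from torch import nn

class FrozenBatchNorm2d(nn.Module):  # hypothetical stand-in; not an nn.BatchNorm2d subclass
    def __init__(self, num_features: int):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_features))
        self.bias = nn.Parameter(torch.zeros(num_features))
        self.register_buffer("running_mean", torch.zeros(num_features))
        self.register_buffer("running_var", torch.ones(num_features))

module = FrozenBatchNorm2d(8)
print(isinstance(module, nn.BatchNorm2d))        # False: the inheritance check misses it
print("BatchNorm" in module.__class__.__name__)  # True: the name check still matches
```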
@@ -20,6 +20,7 @@ import torch
|
|
|
20
20
|
import torch.nn.functional as F
|
|
21
21
|
from torch import Tensor, nn
|
|
22
22
|
|
|
23
|
+
from ... import initialization as init
|
|
23
24
|
from ...configuration_utils import PreTrainedConfig
|
|
24
25
|
from ...modeling_outputs import (
|
|
25
26
|
BackboneOutput,
|
|
@@ -170,6 +171,15 @@ class HGNetV2PreTrainedModel(PreTrainedModel):
|
|
|
170
171
|
input_modalities = ("image",)
|
|
171
172
|
_no_split_modules = ["HGNetV2BasicLayer"]
|
|
172
173
|
|
|
174
|
+
def _init_weights(self, module):
|
|
175
|
+
super()._init_weights(module)
|
|
176
|
+
# We need to check it like that as d_fine models replace the BatchNorm2d by their own
|
|
177
|
+
if "BatchNorm" in module.__class__.__name__:
|
|
178
|
+
init.ones_(module.weight)
|
|
179
|
+
init.zeros_(module.bias)
|
|
180
|
+
init.zeros_(module.running_mean)
|
|
181
|
+
init.ones_(module.running_var)
|
|
182
|
+
|
|
173
183
|
|
|
174
184
|
class HGNetV2LearnableAffineBlock(nn.Module):
|
|
175
185
|
def __init__(self, scale_value: float = 1.0, bias_value: float = 0.0):
|
|
@@ -470,7 +480,11 @@ class HGNetV2Backbone(HGNetV2PreTrainedModel, BackboneMixin):
|
|
|
470
480
|
|
|
471
481
|
@auto_docstring
|
|
472
482
|
def forward(
|
|
473
|
-
self,
|
|
483
|
+
self,
|
|
484
|
+
pixel_values: Tensor,
|
|
485
|
+
output_hidden_states: Optional[bool] = None,
|
|
486
|
+
return_dict: Optional[bool] = None,
|
|
487
|
+
**kwargs,
|
|
474
488
|
) -> BackboneOutput:
|
|
475
489
|
r"""
|
|
476
490
|
Examples:
|
|
@@ -549,6 +563,7 @@ class HGNetV2ForImageClassification(HGNetV2PreTrainedModel):
|
|
|
549
563
|
labels: Optional[torch.LongTensor] = None,
|
|
550
564
|
output_hidden_states: Optional[bool] = None,
|
|
551
565
|
return_dict: Optional[bool] = None,
|
|
566
|
+
**kwargs,
|
|
552
567
|
) -> ImageClassifierOutputWithNoAttention:
|
|
553
568
|
r"""
|
|
554
569
|
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
|
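The new `_init_weights` override above resets both the affine parameters and the running statistics of any BatchNorm-like module, using a name-based check so that subclasses which swap in a custom BatchNorm still match. A minimal sketch of the same pattern, using plain `torch.nn.init` instead of the package's `initialization` helpers (`reset_batchnorm_` is a hypothetical name for illustration):

```python
import torch
from torch import nn

def reset_batchnorm_(module: nn.Module) -> None:
    """Reset affine params and running stats, mirroring the hunk above."""
    # Name-based check so custom BatchNorm subclasses (e.g. d_fine's) also match
    if "BatchNorm" in module.__class__.__name__:
        nn.init.ones_(module.weight)        # gamma back to 1
        nn.init.zeros_(module.bias)         # beta back to 0
        nn.init.zeros_(module.running_mean) # running stats back to identity
        nn.init.ones_(module.running_var)

bn = nn.BatchNorm2d(8)
bn.running_mean.uniform_()  # perturb the stats
reset_batchnorm_(bn)
assert torch.all(bn.running_var == 1)
```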
@@ -848,6 +848,7 @@ class HieraModel(HieraPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*):
@@ -1132,6 +1133,7 @@ class HieraForPreTraining(HieraPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, HieraForPreTrainingOutput]:
         r"""
         noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*):
@@ -1249,6 +1251,7 @@ class HieraForImageClassification(HieraPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, HieraForImageClassificationOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1325,6 +1328,7 @@ class HieraBackbone(HieraPreTrainedModel, BackboneMixin):
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> BackboneOutput:
         """
         Returns:
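The recurring `**kwargs,` additions across these forwards let shared infrastructure (pipelines, generation utilities) pass extra flags uniformly without each model raising `TypeError`. A hedged sketch of the pattern, with `TinyModel` as a hypothetical stand-in:

```python
import torch
from torch import nn

class TinyModel(nn.Module):
    def forward(self, pixel_values: torch.Tensor, return_dict: bool = True, **kwargs):
        # Extra keys the caller threads through are simply tolerated here
        pooled = pixel_values.mean()
        return {"pooled": pooled} if return_dict else pooled

# No TypeError even though the model does not declare `output_attentions`
out = TinyModel()(torch.ones(1, 3, 4, 4), output_attentions=False)
```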
@@ -648,6 +648,10 @@ class HubertPreTrainedModel(PreTrainedModel):
         elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm1d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, nn.Conv1d):
             if is_deepspeed_zero3_enabled():
                 import deepspeed
@@ -892,6 +896,7 @@ class HubertModel(HubertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1038,6 +1043,7 @@ class HubertForCTC(HubertPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
@@ -1149,6 +1155,7 @@ class HubertForSequenceClassification(HubertPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -145,6 +145,10 @@ class HubertPreTrainedModel(PreTrainedModel):
         elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm1d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, nn.Conv1d):
             if is_deepspeed_zero3_enabled():
                 import deepspeed
@@ -226,6 +230,7 @@ class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
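The guard above exists because `nn.LayerNorm` and `nn.GroupNorm` share the weight/bias reset with `nn.BatchNorm1d` but have no running statistics, so the extra buffer resets are gated on `running_mean` actually existing. A minimal sketch of the same guarded-buffer pattern (`init_norm_` is a hypothetical helper name):

```python
from torch import nn

def init_norm_(module: nn.Module) -> None:
    nn.init.zeros_(module.bias)
    nn.init.ones_(module.weight)
    # Only BatchNorm carries running statistics; the getattr guard keeps
    # the same branch safe for LayerNorm and GroupNorm
    if getattr(module, "running_mean", None) is not None:
        nn.init.zeros_(module.running_mean)
        nn.init.ones_(module.running_var)
        module.num_batches_tracked.zero_()

for m in (nn.LayerNorm(4), nn.GroupNorm(2, 4), nn.BatchNorm1d(4)):
    init_norm_(m)  # safe for all three
```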
@@ -30,7 +30,7 @@ from transformers.cache_utils import Cache
 from ...activations import ACT2FN
 from ...cache_utils import DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -38,7 +38,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_hunyuan_v1_dense import HunYuanDenseV1Config


@@ -153,6 +153,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class HunYuanDenseV1Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -178,7 +179,6 @@ class HunYuanDenseV1Attention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         self.query_layernorm = HunYuanDenseV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
         self.key_layernorm = HunYuanDenseV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)

@@ -320,7 +320,7 @@ class HunYuanDenseV1RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -359,7 +359,7 @@ class HunYuanDenseV1RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -148,7 +148,7 @@ class HunYuanDenseV1RotaryEmbedding(LlamaRotaryEmbedding):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)


 class HunYuanDenseV1Model(LlamaModel):
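Two related changes above: the rotary attention helper is now attached via the `@use_kernelized_func(apply_rotary_pos_emb)` decorator instead of a per-instance `self.rotary_fn` attribute, and `original_inv_freq` becomes a non-persistent buffer holding a clone rather than an alias of `inv_freq`. A clone in a buffer follows `.to(device)` and dtype casts with the module, and dynamic RoPE variants can restore it after rescaling. A minimal sketch of the buffer side (`RotarySketch` is an illustrative name, not the library class):

```python
import torch
from torch import nn

class RotarySketch(nn.Module):
    def __init__(self, dim: int = 8, base: float = 10000.0):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # A clone, not an alias: rescaling inv_freq in place must not corrupt it
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

rope = RotarySketch()
rope.inv_freq *= 0.5                         # e.g. dynamic NTK-style rescaling
rope.inv_freq.copy_(rope.original_inv_freq)  # restore for short sequences
```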
@@ -30,15 +30,20 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import (
+    use_experts_implementation,
+    use_kernel_forward_from_hub,
+    use_kernel_func_from_hub,
+    use_kernelized_func,
+)
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_grouped_mm_available
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_hunyuan_v1_moe import HunYuanMoEV1Config


@@ -152,6 +157,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class HunYuanMoEV1Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -177,7 +183,6 @@ class HunYuanMoEV1Attention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         self.query_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
         self.key_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)

@@ -244,6 +249,7 @@ class HunYuanMoEV1Gate(nn.Module):
         return logits


+@use_experts_implementation
 class HunYuanMoEV1Experts(nn.Module):
     """Collection of expert weights stored as 3D tensors."""

@@ -371,7 +377,9 @@ class HunYuanMoEV1PreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
-    _can_compile_fullgraph = False
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True
     _can_record_outputs = {
         "hidden_states": HunYuanMoEV1DecoderLayer,
@@ -413,7 +421,7 @@ class HunYuanMoEV1RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -452,7 +460,7 @@ class HunYuanMoEV1RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -25,7 +25,7 @@ from ... import initialization as init
 from ...cache_utils import Cache
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, logging
+from ...utils import TransformersKwargs, is_grouped_mm_available, logging
 from ..hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1RotaryEmbedding
 from ..llama.modeling_llama import (
     LlamaAttention,
@@ -177,7 +177,9 @@ class HunYuanMoEV1DecoderLayer(LlamaDecoderLayer):


 class HunYuanMoEV1PreTrainedModel(LlamaPreTrainedModel):
-    _can_compile_fullgraph = False
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile

     @torch.no_grad()
     def _init_weights(self, module):
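`_can_compile_fullgraph` is now a capability probe rather than a hard-coded flag: full-graph `torch.compile` support is only advertised when the grouped-matmul kernels the experts path relies on are available at runtime. A hedged sketch of the gating idea (the `grouped_mm_available` helper below is a stand-in, not the library's `is_grouped_mm_available`):

```python
import torch

def grouped_mm_available() -> bool:
    # Stand-in probe: recent PyTorch exposes a private grouped-matmul op
    return hasattr(torch, "_grouped_mm")

class MoESketch:  # illustrative, not the library class
    # Computed once at import time from the runtime environment
    _can_compile_fullgraph = grouped_mm_available()

print(MoESketch._can_compile_fullgraph)
```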
@@ -593,16 +593,32 @@ class IBertPreTrainedModel(PreTrainedModel):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             if module.bias is not None:
                 init.zeros_(module.bias)
+            if getattr(module, "weight_integer", None) is not None:
+                init.zeros_(module.weight_integer)
+                init.zeros_(module.fc_scaling_factor)
+            if getattr(module, "bias_integer", None) is not None:
+                init.zeros_(module.bias_integer)
         elif isinstance(module, (QuantEmbedding, nn.Embedding)):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])
+            if getattr(module, "weight_scaling_factor", None) is not None:
+                init.zeros_(module.weight_scaling_factor)
+                init.zeros_(module.weight_integer)
         elif isinstance(module, (IntLayerNorm, nn.LayerNorm)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "shift", None) is not None:
+                init.zeros_(module.shift)
         elif isinstance(module, IBertLMHead):
             init.zeros_(module.bias)
+        elif isinstance(module, IBertEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+        elif isinstance(module, QuantAct):
+            init.constant_(module.x_min, -1e-5)
+            init.constant_(module.x_max, 1e-5)
+            init.zeros_(module.act_scaling_factor)

     def resize_token_embeddings(self, new_num_tokens=None):
         raise NotImplementedError("`resize_token_embeddings` is not supported for I-BERT.")
@@ -653,6 +669,7 @@ class IBertModel(IBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, tuple[torch.FloatTensor]]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -746,6 +763,7 @@ class IBertForMaskedLM(IBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[MaskedLMOutput, tuple[torch.FloatTensor]]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -836,6 +854,7 @@ class IBertForSequenceClassification(IBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[SequenceClassifierOutput, tuple[torch.FloatTensor]]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -916,6 +935,7 @@ class IBertForMultipleChoice(IBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[MultipleChoiceModelOutput, tuple[torch.FloatTensor]]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -1018,6 +1038,7 @@ class IBertForTokenClassification(IBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[TokenClassifierOutput, tuple[torch.FloatTensor]]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1102,6 +1123,7 @@ class IBertForQuestionAnswering(IBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[QuestionAnsweringModelOutput, tuple[torch.FloatTensor]]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

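Note the `QuantAct` branch above starts the activation range at a tiny symmetric interval (`x_min = -1e-5`, `x_max = 1e-5`) instead of a degenerate `[0, 0]` range, so the first observed batch immediately widens it. A hedged sketch of that idea with hypothetical names (`ActRangeSketch`, `observe`):

```python
import torch
from torch import nn

class ActRangeSketch(nn.Module):
    def __init__(self):
        super().__init__()
        # Near-zero but non-degenerate starting range, as in the hunk above
        self.register_buffer("x_min", torch.full((1,), -1e-5))
        self.register_buffer("x_max", torch.full((1,), 1e-5))

    def observe(self, x: torch.Tensor) -> None:
        self.x_min.clamp_(max=float(x.min()))  # ranges only ever widen
        self.x_max.clamp_(min=float(x.max()))

m = ActRangeSketch()
m.observe(torch.randn(8))
```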
@@ -840,6 +840,7 @@ class IdeficsPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, IdeficsVisionEmbeddings):
             init.normal_(module.class_embedding)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, IdeficsGatedCrossAttentionLayer):
             if self.config.alpha_initializer == "zeros":
                 init.zeros_(module.alpha_cross_attn)
@@ -852,6 +853,15 @@ class IdeficsPreTrainedModel(PreTrainedModel):
             init.normal_(module.alpha_dense, mean=0.0, std=self.config.alphas_initializer_range)
         elif isinstance(module, IdeficsPerceiverResampler):
             init.normal_(module.latents)
+        elif isinstance(module, IdeficsEmbedding):
+            inv_freq = 1.0 / (module.base ** (torch.arange(0, module.dim, 2) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+            t = torch.arange(module.max_position_embeddings).type_as(inv_freq)
+            freqs = torch.einsum("i,j->ij", t, inv_freq)
+            # Different from paper, but it uses a different permutation in order to obtain the same calculation
+            emb = torch.cat((freqs, freqs), dim=-1)
+            init.copy_(module.cos_cached, emb.cos())
+            init.copy_(module.sin_cached, emb.sin())


 @auto_docstring
@@ -1107,31 +1117,15 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel, GenerationMixin):
             bias=False,
             partially_freeze=config.freeze_lm_head,
         )
+        if config.additional_vocab_size > 0:
+            self._tied_weights_keys = {
+                "lm_head.weight": "model.embed_tokens.weight",
+                "lm_head.additional_fc.weight": "model.embed_tokens.additional_embedding.weight",
+            }

         # Initialize weights and apply final processing
         self.post_init()

-    def tie_weights(self, **kwargs):
-        """
-        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of
-        IdeficsDecoupledLinear and IdeficsDecoupledEmbedding.
-        """
-        output_embeddings = self.get_output_embeddings()
-        input_embeddings = self.get_input_embeddings()
-
-        if getattr(self.config, "tie_word_embeddings", True):
-            output_embeddings.weight = input_embeddings.weight
-            if input_embeddings.num_additional_embeddings > 0:
-                assert output_embeddings.out_additional_features == input_embeddings.num_additional_embeddings
-                output_embeddings.additional_fc.weight = input_embeddings.additional_embedding.weight
-
-        if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
-            output_embeddings.out_features = input_embeddings.num_embeddings
-        if hasattr(output_embeddings, "out_additional_features") and hasattr(
-            input_embeddings, "num_additional_embeddings"
-        ):
-            output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings
-
     @can_return_tuple
     @auto_docstring
     def forward(
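The imperative `tie_weights` override is replaced by a declarative `_tied_weights_keys` map of `{destination: source}` parameter names, which the base class walks to alias parameters. A minimal sketch of what that tying amounts to, with `ToyLM` as a hypothetical model and the final aliasing done by hand to show the effect:

```python
from torch import nn

class ToyLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed_tokens = nn.Embedding(10, 4)
        self.lm_head = nn.Linear(4, 10, bias=False)
        # Declarative map: the framework reads this and ties the parameters
        self._tied_weights_keys = {"lm_head.weight": "embed_tokens.weight"}
        # What tying ultimately does (normally performed by the base class):
        self.lm_head.weight = self.embed_tokens.weight

m = ToyLM()
assert m.lm_head.weight.data_ptr() == m.embed_tokens.weight.data_ptr()
```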
@@ -452,6 +452,8 @@ class Idefics2VisionTransformer(Idefics2PreTrainedModel):
         self.encoder = Idefics2Encoder(config)
         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

+        self.post_init()
+
     def get_input_embeddings(self):
         return self.embeddings

@@ -711,6 +713,8 @@ class Idefics2PerceiverResampler(Idefics2PreTrainedModel):
         self.layers = nn.ModuleList([Idefics2PerceiverLayer(config, idx) for idx in range(self.depth)])
         self.norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)

+        self.post_init()
+
     @auto_docstring
     def forward(
         self,
@@ -1115,6 +1119,7 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin)
         pixel_attention_mask=None,
         image_hidden_states=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take
@@ -1130,10 +1135,11 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin)
             pixel_attention_mask=pixel_attention_mask,
             image_hidden_states=image_hidden_states,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if image_hidden_states is not None or
+        if image_hidden_states is not None or not is_first_iteration:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_attention_mask"] = None

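The new `is_first_iteration` flag encodes that vision inputs are only needed on the first forward pass of a generation loop; later steps reuse the cached image features, so `prepare_inputs_for_generation` nulls the pixel tensors. A hedged, self-contained sketch of the control flow (function and dict keys are illustrative):

```python
def prepare_inputs(model_inputs: dict, image_hidden_states, is_first_iteration: bool) -> dict:
    # After the first step (or once features exist), the vision encoder
    # does not need to run again, so the raw pixel inputs are dropped
    if image_hidden_states is not None or not is_first_iteration:
        model_inputs["pixel_values"] = None
        model_inputs["pixel_attention_mask"] = None
    return model_inputs

step0 = prepare_inputs({"pixel_values": "px", "pixel_attention_mask": "m"}, None, True)
step1 = prepare_inputs({"pixel_values": "px", "pixel_attention_mask": "m"}, None, False)
assert step0["pixel_values"] == "px" and step1["pixel_values"] is None
```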
@@ -458,6 +458,8 @@ class Idefics3VisionTransformer(Idefics3PreTrainedModel):
         self.patch_size = config.patch_size
         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

+        self.post_init()
+
     # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionTransformer.get_input_embeddings
     def get_input_embeddings(self):
         return self.embeddings
@@ -887,6 +889,7 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin)
         pixel_attention_mask=None,
         image_hidden_states=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take
@@ -902,10 +905,11 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin)
             pixel_attention_mask=pixel_attention_mask,
             image_hidden_states=image_hidden_states,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
        )

-        if image_hidden_states is not None or
+        if image_hidden_states is not None or not is_first_iteration:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_attention_mask"] = None

@@ -164,12 +164,8 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast):

         input_ids = reorder_images(input_ids_grouped, grouped_images_index)

-        return BatchFeature(
-            data={"input_ids": torch.stack(input_ids, dim=0) if return_tensors else input_ids},
-            tensor_type=return_tensors,
-        )
+        return BatchFeature(data={"input_ids": input_ids}, tensor_type=return_tensors)

-        pixel_values = torch.stack(pixel_values, dim=0) if return_tensors else pixel_values
         return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors)

     def to_dict(self):