transformers 5.0.0rc0-py3-none-any.whl → 5.0.0rc2-py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/fast_vlm/__init__.py

@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_fast_vlm import *
+    from .modeling_fast_vlm import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
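The new `__init__.py` follows the lazy-import pattern used throughout `transformers`: type checkers see the eager `import *` branch, while at runtime the package replaces itself in `sys.modules` with a `_LazyModule` that only imports `configuration_fast_vlm`/`modeling_fast_vlm` when one of their symbols is first accessed. A minimal sketch of the resulting behaviour, assuming transformers 5.0.0rc2 is installed:

```python
import sys

# Importing the package only runs the small __init__.py above; the heavy
# configuration/modeling submodules are not imported yet.
import transformers.models.fast_vlm as fast_vlm

print(type(fast_vlm).__name__)  # expected: "_LazyModule"
print("transformers.models.fast_vlm.modeling_fast_vlm" in sys.modules)  # expected: False

# First attribute access triggers the real import of configuration_fast_vlm.
config_cls = fast_vlm.FastVlmConfig
print(config_cls.model_type)  # expected: "fast_vlm"
```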
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
|
2
|
+
# This file was automatically generated from src/transformers/models/fast_vlm/modular_fast_vlm.py.
|
|
3
|
+
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
|
4
|
+
# the file from the modular. If any change should be done, please apply the change to the
|
|
5
|
+
# modular_fast_vlm.py file directly. One of our CI enforces this.
|
|
6
|
+
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
|
7
|
+
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
|
8
|
+
#
|
|
9
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
10
|
+
# you may not use this file except in compliance with the License.
|
|
11
|
+
# You may obtain a copy of the License at
|
|
12
|
+
#
|
|
13
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
14
|
+
#
|
|
15
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
16
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
17
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
18
|
+
# See the License for the specific language governing permissions and
|
|
19
|
+
# limitations under the License.
|
|
20
|
+
|
|
21
|
+
from ...configuration_utils import PreTrainedConfig
|
|
22
|
+
from ..auto import CONFIG_MAPPING, AutoConfig
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class FastVlmConfig(PreTrainedConfig):
|
|
26
|
+
r"""
|
|
27
|
+
This is the configuration class to store the configuration of a [`FastVlmForConditionalGeneration`]. It is used to instantiate a
|
|
28
|
+
FastVLM model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
|
29
|
+
with the defaults will yield the same configuration as the one of FastVLM-7B.
|
|
30
|
+
|
|
31
|
+
e.g. [KamilaMila/FastVLM-7B](https://huggingface.co/KamilaMila/FastVLM-7B)
|
|
32
|
+
|
|
33
|
+
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `TimmWrapperConfig` for `fastvit_mci3`):
            The config object or dictionary of the vision backbone.
        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`):
            The config object or dictionary of the text backbone.
        image_token_id (`int`, *optional*, defaults to 151646):
            The image token index to encode the image prompt.
        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The activation function used by the multimodal projector.
        vision_feature_select_strategy (`str`, *optional*, defaults to `"full"`):
            The feature selection strategy used to select the vision feature from the vision backbone.
            Only "full" supported.
        vision_feature_layer (`Union[int, list[int]]`, *optional*, defaults to -1):
            The index of the layer to select the vision feature. If multiple indices are provided,
            the vision feature of the corresponding indices will be concatenated to form the
            vision features. Only -1 supported.
        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the multimodal projector.

    Example:

    ```python
    >>> from transformers import FastVlmForConditionalGeneration, FastVlmConfig

    >>> # Initializing a FastVLM-7B style configuration
    >>> configuration = FastVlmConfig()

    >>> # Initializing a model from the FastVLM-7B style configuration
    >>> model = FastVlmForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "fast_vlm"
    attribute_map = {
        "image_token_id": "image_token_index",
    }
    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        image_token_id=151646,
        projector_hidden_act="gelu",
        vision_feature_select_strategy="full",
        vision_feature_layer=-1,
        multimodal_projector_bias=True,
        **kwargs,
    ):
        self.image_token_id = image_token_id
        self.projector_hidden_act = projector_hidden_act

        if vision_feature_select_strategy != "full":
            raise ValueError(
                f"Unexpected select feature strategy: {vision_feature_select_strategy}. Only 'full' is supported in FastVLM."
            )

        if vision_feature_layer != -1:
            raise ValueError(
                f"Unexpected vision feature layer: {vision_feature_layer}. Only -1 is supported in FastVLM."
            )

        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.vision_feature_layer = vision_feature_layer

        if isinstance(vision_config, dict):
            vision_config["model_type"] = vision_config.get("model_type", "timm_wrapper")
            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
        elif vision_config is None:
            vision_config = CONFIG_MAPPING["timm_wrapper"](
                architecture="fastvit_mci3",
                do_pooling=True,
                global_pool="avg",
                hidden_size=3072,
                initializer_range=0.02,
                model_args={"inference_mode": True},
            )

        self.vision_config = vision_config

        if isinstance(text_config, dict):
            text_config["model_type"] = text_config.get("model_type", "qwen2")
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            text_config = CONFIG_MAPPING["qwen2"](
                hidden_size=3584,
                vocab_size=152128,
                intermediate_size=18944,
                num_attention_heads=28,
                num_key_value_heads=4,
                num_hidden_layers=28,
            )

        self.text_config = text_config
        self.multimodal_projector_bias = multimodal_projector_bias

        super().__init__(**kwargs)


__all__ = ["FastVlmConfig"]
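
A brief editorial note on the constructor above: when `vision_config` / `text_config` arrive as plain dictionaries, a missing `model_type` is filled in (`"timm_wrapper"` for vision, `"qwen2"` for text) and the dict is rebuilt through `CONFIG_MAPPING`. A minimal sketch of that behavior, using the top-level `FastVlmConfig` import shown in the class docstring; the dictionary values here are illustrative only, not taken from any released checkpoint:

```python
from transformers import FastVlmConfig

# Sub-configs passed as dicts are rebuilt as full config objects; a missing
# "model_type" defaults to "timm_wrapper" (vision) or "qwen2" (text).
config = FastVlmConfig(
    vision_config={"architecture": "fastvit_mci3", "do_pooling": True, "hidden_size": 3072},
    text_config={"hidden_size": 1024, "num_hidden_layers": 4},
)
print(type(config.vision_config).__name__)  # expected: TimmWrapperConfig
print(type(config.text_config).__name__)    # expected: Qwen2Config
```
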
@@ -0,0 +1,459 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/fast_vlm/modular_fast_vlm.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_fast_vlm.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
from ..auto import AutoModel
from .configuration_fast_vlm import FastVlmConfig


class FastVlmMultiModalProjector(nn.Module):
    def __init__(self, config: FastVlmConfig):
        super().__init__()
        self.linear_1 = nn.Linear(
            config.vision_config.hidden_size,
            config.text_config.hidden_size,
            bias=config.multimodal_projector_bias,
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(
            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
        )

    def forward(self, image_features):
        hidden_states = self.linear_1(image_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


@auto_docstring
class FastVlmPreTrainedModel(PreTrainedModel):
    config: FastVlmConfig
    base_model_prefix = "model"
    input_modalities = ("image", "text")
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"

    _supports_flash_attn = True
    _supports_sdpa = True

    _can_compile_fullgraph = True
    _supports_flex_attn = True
    _supports_attention_backend = True


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for FastVlm outputs, with hidden states and attentions.
    """
)
class FastVlmModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    image_hidden_states: Optional[torch.FloatTensor] = None


@auto_docstring(
    custom_intro="""
    The FastVlm model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class FastVlmModel(FastVlmPreTrainedModel):
    _checkpoint_conversion_mapping = {}

    def __init__(self, config: FastVlmConfig):
        super().__init__(config)
        self.vision_tower = AutoModel.from_config(config.vision_config)

        self.multi_modal_projector = FastVlmMultiModalProjector(config)
        self.language_model = AutoModel.from_config(config.text_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        **kwargs,
    ):
        """
        Obtains image last hidden states from the vision tower and applies multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                The tensors corresponding to the input images.
            vision_feature_layer (`Union[int, list[int]]`, *optional*):
                The index/indices of the layer to select the vision feature. Only -1 supported.
            vision_feature_select_strategy (`str`, *optional*):
                The feature selection strategy used to select the vision feature from the vision backbone.
                Only "full" supported.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        kwargs = {k: v for k, v in kwargs.items() if v is not None}
        image_outputs = self.vision_tower(pixel_values, **kwargs)

        # since the vision tower is hybrid in FastVLM, its output needs to be handled differently from Llava
        selected_image_feature = image_outputs.last_hidden_state
        selected_image_feature = selected_image_feature.flatten(2).permute(0, 2, 1)
        image_features = self.multi_modal_projector(selected_image_feature)
        image_features = list(image_features)
        return image_features

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        n_image_features = image_features.shape[0] * image_features.shape[1]
        if inputs_embeds[special_image_mask].numel() != image_features.numel():
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
            )
        return special_image_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        cache_position: Optional[torch.LongTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, FastVlmModelOutputWithPast]:
        r"""
        vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*):
            The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the
            corresponding indices will be concatenated to form the vision features. Only -1 supported.
        vision_feature_select_strategy (`str`, *optional*):
            The feature selection strategy used to select the vision feature from the vision backbone. Only "full" supported.
        """
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values,
                vision_feature_layer=vision_feature_layer,
                vision_feature_select_strategy=vision_feature_select_strategy,
                image_sizes=image_sizes,
            )
            image_features = torch.cat(image_features, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            **kwargs,
        )

        return FastVlmModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )

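An editorial aside before the causal-LM classes: the core of `FastVlmModel.forward` is splicing the projected image features into the text embeddings wherever the image placeholder token appears, via `masked_scatter`. A minimal, self-contained sketch of that mechanism with made-up shapes and a hypothetical placeholder id (not the real default of 151646):

```python
import torch

batch, seq_len, hidden = 1, 6, 4
image_token_id = 9  # hypothetical placeholder id for this toy example
input_ids = torch.tensor([[1, 9, 9, 9, 2, 3]])       # three image placeholder tokens
inputs_embeds = torch.zeros(batch, seq_len, hidden)   # stand-in for text embeddings
image_features = torch.ones(3, hidden)                # stand-in for projected vision features

# Boolean mask over placeholder positions, expanded to the embedding dimension,
# mirroring get_placeholder_mask()
special_image_mask = (input_ids == image_token_id).unsqueeze(-1).expand_as(inputs_embeds)

# masked_scatter writes the image features, in order, into the masked positions
merged = inputs_embeds.masked_scatter(special_image_mask, image_features)
print(merged[0, 1:4])  # the three placeholder slots now hold the image features
```
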
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for FastVlm causal language model (or autoregressive) outputs.
    """
)
class FastVlmCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Cache] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[torch.FloatTensor] = None


@auto_docstring(
    custom_intro="""
    The FastVlm model which consists of a vision backbone and a language model.
    """
)
class FastVlmForConditionalGeneration(FastVlmPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {}
    _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}

    def __init__(self, config: FastVlmConfig):
        super().__init__(config)
        self.model = FastVlmModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        **kwargs,
    ):
        return self.model.get_image_features(
            pixel_values=pixel_values,
            vision_feature_layer=vision_feature_layer,
            vision_feature_select_strategy=vision_feature_select_strategy,
            **kwargs,
        )

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        labels: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        image_sizes: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, FastVlmCausalLMOutputWithPast]:
        r"""
        vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*):
            The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the
            corresponding indices will be concatenated to form the vision features. Only -1 supported.
        vision_feature_select_strategy (`str`, *optional*):
            The feature selection strategy used to select the vision feature from the vision backbone. Only "full" supported.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"

        >>> model = AutoModelForImageTextToText.from_pretrained("KamilaMila/FastVLM-0.5B").to(device)
        >>> processor = AutoProcessor.from_pretrained("KamilaMila/FastVLM-0.5B")

        >>> conversation = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {"type": "text", "text": "What are these?"},
        ...             {"type": "image"}
        ...         ]
        ...     }
        ... ]

        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

        >>> # Generate
        >>> generated_ids = model.generate(**inputs, max_new_tokens=15)
        >>> print(processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
        system\n You are a helpful assistant.\n user\n What are these?\n assistant\n The image depicts a traditional Chinese street...
        ```"""
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            vision_feature_layer=vision_feature_layer,
            vision_feature_select_strategy=vision_feature_select_strategy,
            cache_position=cache_position,
            image_sizes=image_sizes,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return FastVlmCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        is_first_iteration=False,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            is_first_iteration=is_first_iteration,
            **kwargs,
        )

        if is_first_iteration or not kwargs.get("use_cache", True):
            # Pixel values are used only in the first iteration if available
            # In subsequent iterations, they are already merged with text and cached
            # NOTE: first iteration doesn't have to be prefill, it can be the first
            # iteration with a question and cached system prompt (continue generate from cache)
            model_inputs["pixel_values"] = pixel_values

        return model_inputs


__all__ = ["FastVlmForConditionalGeneration", "FastVlmModel", "FastVlmPreTrainedModel"]
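
One last editorial note on `FastVlmForConditionalGeneration.forward`: when `logits_to_keep` is an integer, only the last `logits_to_keep` positions are passed through `lm_head`, and `0` keeps the whole sequence because `slice(-0, None)` equals `slice(0, None)`. A toy illustration, not library code:

```python
import torch

hidden_states = torch.arange(5.0).view(1, 5, 1)  # (batch, seq_len, hidden)

for logits_to_keep in (0, 1, 2):
    # same slicing expression as in forward()
    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
    kept = hidden_states[:, slice_indices, :]
    print(logits_to_keep, kept.shape[1])  # 0 -> 5, 1 -> 1, 2 -> 2
```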