transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -165,63 +165,103 @@ class Lfm2VlProcessor(ProcessorMixin):
         image_sizes: list[list[int]],
         use_image_special_tokens: bool,
         **images_kwargs,
-    ):
-
+    ) -> list[str]:
+        use_thumbnail = images_kwargs.get("use_thumbnail", self.image_processor.use_thumbnail)
+        image_data = iter(zip(image_rows, image_cols, image_sizes))
 
-
+        prompt_strings = []
         for sample_text, sample_images in zip(text, images):
-
-
-
-
-
-            sample_text_with_image_tokens += self.image_start_token
+            text_parts = sample_text.split(self.image_token)
+            result_parts = []
+
+            for i, _ in enumerate(sample_images):
+                result_parts.append(text_parts[i])
 
                 rows, cols, image_size = next(image_data)
-
-
-
-
-
-
-
-
-
-
-                if use_image_special_tokens:
-                    sample_text_with_image_tokens += self.image_thumbnail_token
-                    sample_text_with_image_tokens += self.image_token * num_thumbnail_tokens
-                else:
-                    sample_text_with_image_tokens += self.image_token * num_thumbnail_tokens
+                tokens_per_tile, tokens_for_image = self._get_image_num_tokens(image_size, **images_kwargs)
+                image_tokens = self._build_image_tokens(
+                    rows,
+                    cols,
+                    tokens_per_tile,
+                    tokens_for_image,
+                    use_thumbnail,
+                    use_image_special_tokens,
+                )
+                result_parts.append(image_tokens)
 
-
-
+            # Add remaining text after the last image
+            if len(sample_images) < len(text_parts):
+                result_parts.append(text_parts[-1])
 
-
-            prompt_strings.append(sample_text_with_image_tokens)
+            prompt_strings.append("".join(result_parts))
 
         return prompt_strings
 
+    def _build_image_tokens(
+        self,
+        rows: int,
+        cols: int,
+        tokens_per_tile: int,
+        tokens_for_image: int,
+        use_thumbnail: bool,
+        use_image_special_tokens: bool,
+    ) -> str:
+        """Build the expanded token string for a single image."""
+        parts = []
+
+        if use_image_special_tokens:
+            parts.append(self.image_start_token)
+
+        is_multi_tile = rows > 1 or cols > 1
+        if is_multi_tile:
+            for row in range(rows):
+                for col in range(cols):
+                    if use_image_special_tokens:
+                        parts.append(f"<|img_row_{row + 1}_col_{col + 1}|>")
+                    parts.append(self.image_token * tokens_per_tile)
+
+            if use_thumbnail:
+                if use_image_special_tokens:
+                    parts.append(self.image_thumbnail_token)
+                parts.append(self.image_token * tokens_for_image)
+        else:
+            parts.append(self.image_token * tokens_for_image)
+
+        if use_image_special_tokens:
+            parts.append(self.image_end_token)
+
+        return "".join(parts)
+
+    def _compute_tokens_per_tile(self, tile_size: int, encoder_patch_size: int, downsample_factor: int) -> int:
+        """Compute the number of tokens for a single tile."""
+        num_patches = tile_size // encoder_patch_size
+        downsampled_patches = math.ceil(num_patches / downsample_factor)
+        return downsampled_patches * downsampled_patches
+
+    def _compute_tokens_for_image(self, image_size: list[int], encoder_patch_size: int, downsample_factor: int) -> int:
+        """Compute the number of tokens for a resized image (used for single-tile or thumbnail)."""
+        image_height, image_width = image_size
+        patches_h = math.ceil((image_height // encoder_patch_size) / downsample_factor)
+        patches_w = math.ceil((image_width // encoder_patch_size) / downsample_factor)
+        return patches_h * patches_w
+
     def _get_image_num_tokens(self, image_size: list[int], **images_kwargs) -> tuple[int, int]:
+        """
+        Compute token counts for image processing.
+
+        Returns:
+            tuple[int, int]: (tokens_per_tile, tokens_for_image)
+                - tokens_per_tile: tokens for each tile in multi-tile mode
+                - tokens_for_image: tokens for the resized image (single-tile) or thumbnail (multi-tile)
+        """
         tile_size = images_kwargs.get("tile_size", self.image_processor.tile_size)
         downsample_factor = images_kwargs.get("downsample_factor", self.image_processor.downsample_factor)
        encoder_patch_size = images_kwargs.get("encoder_patch_size", self.image_processor.encoder_patch_size)
-        use_thumbnail = images_kwargs.get("use_thumbnail", self.image_processor.use_thumbnail)
-
-        thumbnail_tokens = 0
-        if use_thumbnail:
-            image_height, image_width = image_size
-            num_patches_height = image_height // encoder_patch_size
-            num_patches_width = image_width // encoder_patch_size
-            dwn_num_patches_height = math.ceil(num_patches_height / downsample_factor)
-            dwn_num_patches_width = math.ceil(num_patches_width / downsample_factor)
-            thumbnail_tokens = dwn_num_patches_height * dwn_num_patches_width
 
-
-
-        tile_tokens = dwn_num_patches_tile * dwn_num_patches_tile
+        tokens_per_tile = self._compute_tokens_per_tile(tile_size, encoder_patch_size, downsample_factor)
+        tokens_for_image = self._compute_tokens_for_image(image_size, encoder_patch_size, downsample_factor)
 
-        return
+        return tokens_per_tile, tokens_for_image
 
     def batch_decode(self, *args, **kwargs):
         """
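As a worked illustration of the token arithmetic introduced above, here is a minimal standalone sketch using assumed example values (tile_size=512, encoder_patch_size=16, downsample_factor=2, and a 1024x768 input); the actual values come from the LFM2-VL image processor configuration:

```python
import math

# Assumed example settings; the real defaults live in the image processor config.
tile_size, encoder_patch_size, downsample_factor = 512, 16, 2
image_height, image_width = 1024, 768

# tokens_per_tile: a tile is cut into patches, then downsampled along each axis.
num_patches = tile_size // encoder_patch_size                      # 32
downsampled_patches = math.ceil(num_patches / downsample_factor)   # 16
tokens_per_tile = downsampled_patches * downsampled_patches        # 256

# tokens_for_image: the same computation on the resized full image
# (used for single-tile inputs or for the thumbnail in multi-tile mode).
patches_h = math.ceil((image_height // encoder_patch_size) / downsample_factor)  # 32
patches_w = math.ceil((image_width // encoder_patch_size) / downsample_factor)   # 24
tokens_for_image = patches_h * patches_w                                         # 768
```

A multi-tile image then repeats `tokens_per_tile` placeholder tokens once per tile and, when `use_thumbnail` is enabled, appends `tokens_for_image` more for the thumbnail.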
@@ -174,9 +174,8 @@ class LightGlueImageProcessorFast(BaseImageProcessorFast):
         stacked_pairs = [torch.stack(pair, dim=0) for pair in image_pairs]
 
         # Return in same format as slow processor
-        image_pairs = torch.stack(stacked_pairs, dim=0) if return_tensors else stacked_pairs
 
-        return BatchFeature(data={"pixel_values":
+        return BatchFeature(data={"pixel_values": stacked_pairs}, tensor_type=return_tensors)
 
     def post_process_keypoint_matching(
         self,
@@ -27,6 +27,7 @@ from torch import nn
 from torch.nn.utils.rnn import pad_sequence
 
 from ...activations import ACT2FN
+from ...integrations import use_kernelized_func
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
@@ -174,6 +175,7 @@ def eager_attention_forward(
|
|
|
174
175
|
return attn_output, attn_weights
|
|
175
176
|
|
|
176
177
|
|
|
178
|
+
@use_kernelized_func(apply_rotary_pos_emb)
|
|
177
179
|
class LightGlueAttention(nn.Module):
|
|
178
180
|
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
|
179
181
|
|
|
@@ -199,7 +201,6 @@ class LightGlueAttention(nn.Module):
|
|
|
199
201
|
self.o_proj = nn.Linear(
|
|
200
202
|
config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
|
|
201
203
|
)
|
|
202
|
-
self.rotary_fn = apply_rotary_pos_emb
|
|
203
204
|
|
|
204
205
|
def forward(
|
|
205
206
|
self,
|
|
@@ -870,6 +871,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
|
|
|
870
871
|
labels: Optional[torch.LongTensor] = None,
|
|
871
872
|
output_attentions: Optional[bool] = None,
|
|
872
873
|
output_hidden_states: Optional[bool] = None,
|
|
874
|
+
**kwargs,
|
|
873
875
|
) -> Union[tuple, "LightGlueKeypointMatchingOutput"]:
|
|
874
876
|
loss = None
|
|
875
877
|
if labels is not None:
|
|
@@ -927,6 +927,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
|
|
|
927
927
|
labels: Optional[torch.LongTensor] = None,
|
|
928
928
|
output_attentions: Optional[bool] = None,
|
|
929
929
|
output_hidden_states: Optional[bool] = None,
|
|
930
|
+
**kwargs,
|
|
930
931
|
) -> Union[tuple, "LightGlueKeypointMatchingOutput"]:
|
|
931
932
|
loss = None
|
|
932
933
|
if labels is not None:
|
|
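
Note (editorial, not part of the diff): the per-instance `self.rotary_fn = apply_rotary_pos_emb` assignment is replaced by decorating the attention class with `use_kernelized_func(apply_rotary_pos_emb)` (the same pattern appears in the Llama hunks below). The sketch here is a simplified stand-in for that registration idea; it is NOT the transformers implementation of `use_kernelized_func`, and the `rotary_fn` attribute name is only assumed for illustration.

```python
# Simplified stand-in: attach a (possibly kernel-accelerated) helper at the
# class level instead of assigning it per instance in __init__.
def use_kernelized_func_sketch(func):
    def wrap(cls):
        cls.rotary_fn = staticmethod(func)  # hypothetical attribute name
        return cls
    return wrap

def apply_rotary_pos_emb_stub(q, k, cos, sin):
    return q, k  # placeholder

@use_kernelized_func_sketch(apply_rotary_pos_emb_stub)
class AttentionSketch:
    def forward(self, q, k, cos, sin):
        return self.rotary_fn(q, k, cos, sin)
```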
@@ -21,6 +21,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import (
@@ -279,11 +280,9 @@ class LiltSelfAttention(nn.Module):
         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
         context_layer = context_layer.view(*new_context_layer_shape)

-        outputs = (
-
-
-            else ((context_layer, layout_context_layer),)
-        )
+        outputs = (context_layer, layout_context_layer)
+        if output_attentions:
+            outputs = outputs + (attention_probs,)

         return outputs

@@ -327,9 +326,9 @@ class LiltAttention(nn.Module):
             attention_mask,
             output_attentions,
         )
-        attention_output = self.output(self_outputs[0]
-        layout_attention_output = self.layout_output(self_outputs[
-        outputs = (
+        attention_output = self.output(self_outputs[0], hidden_states)
+        layout_attention_output = self.layout_output(self_outputs[1], layout_inputs)
+        outputs = (attention_output, layout_attention_output) + self_outputs[2:]  # add attentions if we output them
         return outputs


@@ -395,10 +394,10 @@ class LiltLayer(GradientCheckpointingLayer):
             attention_mask,
             output_attentions=output_attentions,
         )
-        attention_output = self_attention_outputs[0]
-        layout_attention_output = self_attention_outputs[
+        attention_output = self_attention_outputs[0]
+        layout_attention_output = self_attention_outputs[1]

-        outputs = self_attention_outputs[
+        outputs = self_attention_outputs[2:]  # add self attentions if we output attention weights

         layer_output = apply_chunking_to_forward(
             self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
@@ -406,7 +405,7 @@ class LiltLayer(GradientCheckpointingLayer):
         layout_layer_output = apply_chunking_to_forward(
             self.layout_feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, layout_attention_output
         )
-        outputs = (
+        outputs = (layer_output, layout_layer_output) + outputs

         return outputs

@@ -451,11 +450,11 @@ class LiltEncoder(nn.Module):
                 output_attentions,
             )

-            hidden_states = layer_outputs[0]
-            layout_inputs = layer_outputs[
+            hidden_states = layer_outputs[0]
+            layout_inputs = layer_outputs[1]

             if output_attentions:
-                all_self_attentions = all_self_attentions + (layer_outputs[
+                all_self_attentions = all_self_attentions + (layer_outputs[2],)

             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
@@ -500,6 +499,11 @@ class LiltPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = []

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, LiltTextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+

 @auto_docstring
 class LiltModel(LiltPreTrainedModel):
@@ -538,6 +542,7 @@ class LiltModel(LiltPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPooling]:
         r"""
         bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
@@ -665,6 +670,7 @@ class LiltForSequenceClassification(LiltPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
         r"""
         bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
@@ -780,6 +786,7 @@ class LiltForTokenClassification(LiltPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
         r"""
         bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
@@ -897,6 +904,7 @@ class LiltForQuestionAnswering(LiltPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
         r"""
         bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
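
Note (editorial, not part of the diff): the new `_init_weights` above re-fills the `position_ids` buffer through `transformers.initialization.copy_`. A minimal plain-PyTorch sketch of the same in-place buffer initialization, with `init.copy_` assumed to behave like `Tensor.copy_`:

```python
import torch
from torch import nn

# Sketch: a module with a registered position_ids buffer, re-initialized in place.
class TextEmbeddingsSketch(nn.Module):
    def __init__(self, max_positions: int = 512):
        super().__init__()
        self.register_buffer("position_ids", torch.zeros(1, max_positions, dtype=torch.long))

emb = TextEmbeddingsSketch()
with torch.no_grad():
    emb.position_ids.copy_(torch.arange(emb.position_ids.shape[-1]).expand((1, -1)))
print(emb.position_ids[0, :5])  # tensor([0, 1, 2, 3, 4])
```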
@@ -26,7 +26,7 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import (
     GenericForQuestionAnswering,
@@ -42,7 +42,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_llama import LlamaConfig


@@ -87,7 +87,7 @@ class LlamaRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -126,7 +126,7 @@ class LlamaRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -224,6 +224,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class LlamaAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -249,7 +250,6 @@ class LlamaAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb

     def forward(
         self,
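
Note (editorial, not part of the diff): a self-contained sketch of the float32 frequency computation that the disabled-autocast block protects. Dimensions and the base of 10000 are illustrative; `torch.autocast(..., enabled=False)` stands in for the `maybe_autocast` helper used in the hunk.

```python
import torch

# Rotary-embedding frequencies are computed in float32 regardless of any
# surrounding autocast context.
head_dim, seq_len = 64, 8
inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))  # (head_dim/2,)
position_ids = torch.arange(seq_len).float()                                   # (seq_len,)

with torch.autocast(device_type="cpu", enabled=False):  # stand-in for maybe_autocast
    freqs = torch.outer(position_ids, inv_freq)   # (seq_len, head_dim/2)
    emb = torch.cat((freqs, freqs), dim=-1)       # (seq_len, head_dim)
    cos, sin = emb.cos(), emb.sin()

print(cos.shape, cos.dtype)  # torch.Size([8, 64]) torch.float32
```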
@@ -12,11 +12,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Optional, Union

-from tokenizers import
+from tokenizers import Tokenizer, decoders, pre_tokenizers
 from tokenizers.models import BPE

-from ...tokenization_utils_base import _get_prepend_scheme
+from ...tokenization_utils_base import _get_prepend_scheme
 from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging

@@ -61,6 +62,10 @@ class LlamaTokenizer(TokenizersBackend):
     refer to this superclass for more information regarding those methods.

     Args:
+        vocab (`str`, `dict` or `list`, *optional*):
+            Path to the vocabulary file, a dictionary or a list of tokens.
+        merges (`str` or `list`, *optional*):
+            Path to the merges file or a list of merges.
         clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
             Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
             extra spaces.
@@ -84,42 +89,32 @@ class LlamaTokenizer(TokenizersBackend):
     vocab_files_names = VOCAB_FILES_NAMES
     padding_side = "left"
     model_input_names = ["input_ids", "attention_mask"]
+    model = BPE

     def __init__(
         self,
+        vocab: Optional[Union[str, dict, list]] = None,
+        merges: Optional[Union[str, list]] = None,
         clean_up_tokenization_spaces=False,
         unk_token="<unk>",
         bos_token="<s>",
         eos_token="</s>",
-        add_bos_token=True,
-        add_eos_token=False,
         use_default_system_prompt=False,
         legacy=False,
         add_prefix_space=None,
-        vocab=None,
-        merges=None,
         **kwargs,
     ):
         self.add_prefix_space = add_prefix_space if add_prefix_space is not None else True
-
-
-
-            {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
-        )
-        else:
+        self.legacy = legacy
+        self._vocab = vocab
+        if vocab is None:
             self._vocab = {
                 str(unk_token): 0,
                 str(bos_token): 1,
                 str(eos_token): 2,
             }

-
-
-        filtered_vocab = {t: i for t, i in self._vocab.items() if t not in special_tokens}
-        if merges is not None:
-            self._merges = [tuple(merge) if isinstance(merge, list) else merge for merge in merges]
-        else:
-            self._merges = generate_merges(filtered_vocab)
+        self._merges = merges or []
         self._tokenizer = Tokenizer(
             BPE(vocab=self._vocab, merges=self._merges, fuse_unk=True, byte_fallback=True, dropout=None)
         )
@@ -138,40 +133,17 @@ class LlamaTokenizer(TokenizersBackend):
             sequence += [decoders.Strip(content=" ", left=1)]

         self._tokenizer.decoder = decoders.Sequence(sequence)
-
-
+        self.use_default_system_prompt = use_default_system_prompt
         super().__init__(
-            tokenizer_object=tokenizer_object,
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
-            add_bos_token=add_bos_token,
-            add_eos_token=add_eos_token,
             use_default_system_prompt=use_default_system_prompt,
             add_prefix_space=add_prefix_space,
             **kwargs,
         )

-        self._add_bos_token = add_bos_token
-        self._add_eos_token = add_eos_token
-        self.use_default_system_prompt = use_default_system_prompt
-
-        self._post_init()
-
-    def _post_init(self):
-        """Post-initialization setup that needs to run after _tokenizer is set."""
-        # Only set pre_tokenizer/normalizer for Llama-3 style tokenizers (use Sequence)
-        pre_tok = self._tokenizer.pre_tokenizer
-        if pre_tok is None or type(pre_tok).__name__ != "Sequence":
-            self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
-                replacement="▁", prepend_scheme="first", split=False
-            )
-            self._tokenizer.normalizer = None
-        self.add_tokens([AddedToken(token, special=True) for token in self.all_special_tokens])
-        super()._post_init()
-        self.update_post_processor()
-

 __all__ = ["LlamaTokenizer", "LlamaTokenizerFast"]
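
Note (editorial, not part of the diff): the constructor above now builds its backend directly from the `vocab`/`merges` arguments. A minimal sketch of that underlying `tokenizers` BPE construction, using a toy vocabulary rather than the real Llama vocabulary:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Toy vocab/merges illustrating the BPE backend built from `vocab` and `merges`.
vocab = {"<unk>": 0, "<s>": 1, "</s>": 2, "l": 3, "o": 4, "lo": 5}
merges = [("l", "o")]
tok = Tokenizer(BPE(vocab=vocab, merges=merges, fuse_unk=True, byte_fallback=True, dropout=None))
print(tok.encode("lo").tokens)  # ['lo']
```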
@@ -419,10 +419,9 @@ class Llama4ImageProcessorFast(BaseImageProcessorFast):
         )
         grouped_processed_images[shape] = torch.cat([processed_images, global_tiles.unsqueeze(1)], dim=1)
         processed_images = reorder_images(grouped_processed_images, grouped_images_index)
-
+        aspect_ratios = reorder_images(grouped_aspect_ratios, grouped_images_index)

         processed_images = torch.cat(processed_images, dim=0) if return_tensors else processed_images
-        aspect_ratios = torch.stack(aspect_ratios_list, dim=0) if return_tensors else aspect_ratios_list
         return BatchFeature(
             data={"pixel_values": processed_images, "aspect_ratios": aspect_ratios}, tensor_type=return_tensors
         )
@@ -40,7 +40,7 @@ from ...modeling_rope_utils import (
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_llama4 import Llama4Config, Llama4TextConfig


@@ -188,7 +188,7 @@ class Llama4TextRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -228,7 +228,7 @@ class Llama4TextRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.to(x.device) @ position_ids_expanded).transpose(1, 2)
             freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # Convert to complex representation
             freqs_cis = freqs_cis * self.attention_scaling
@@ -1072,6 +1072,7 @@ class Llama4VisionModel(Llama4PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[BaseModelOutput, tuple[torch.Tensor, ...]]:
         r"""

@@ -1386,6 +1387,7 @@ class Llama4ForConditionalGeneration(Llama4PreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1397,12 +1399,15 @@ class Llama4ForConditionalGeneration(Llama4PreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if
-        #
-        #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values

         return model_inputs
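
Note (editorial, not part of the diff): a simplified sketch of the `is_first_iteration` gating shared by the multimodal `prepare_inputs_for_generation` overrides in this diff. The real methods delegate to the parent class first; only the final gating step is shown.

```python
# Pixel values are forwarded only on the first generation step (or when no
# cache is used); afterwards the image embeddings already live in the KV cache.
def prepare_inputs_sketch(model_inputs: dict, pixel_values, is_first_iteration: bool = False, **kwargs):
    if is_first_iteration or not kwargs.get("use_cache", True):
        model_inputs["pixel_values"] = pixel_values
    return model_inputs

print(prepare_inputs_sketch({}, "PIXELS", is_first_iteration=True))   # {'pixel_values': 'PIXELS'}
print(prepare_inputs_sketch({}, "PIXELS", is_first_iteration=False))  # {}
print(prepare_inputs_sketch({}, "PIXELS", use_cache=False))           # {'pixel_values': 'PIXELS'}
```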
@@ -149,7 +149,6 @@ class LlavaImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images

         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
@@ -202,10 +202,11 @@ class LlavaModel(LlavaPreTrainedModel):
         image_features = self.multi_modal_projector(selected_image_feature)

         if "image_sizes" in kwargs:
-            split_sizes =
-            (
-
-
+            split_sizes = (
+                (torch.as_tensor(kwargs["image_sizes"], device=image_features.device) // self.vision_tower.patch_size)
+                .prod(dim=-1)
+                .tolist()
+            )
             image_features = torch.split(image_features.squeeze(0), split_sizes)
         else:
             image_features = list(image_features)
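
Note (editorial, not part of the diff): a worked example of the `split_sizes` computation above with toy numbers; `patch_size = 14` and the hidden size are assumptions for illustration only.

```python
import torch

# Per-image feature counts: (H // patch) * (W // patch), then used to split the
# flat batch of image features back into per-image chunks.
patch_size = 14
image_sizes = torch.tensor([[336, 336], [336, 672]])
split_sizes = ((image_sizes // patch_size).prod(dim=-1)).tolist()
print(split_sizes)  # [576, 1152]

image_features = torch.rand(1, sum(split_sizes), 4096)
chunks = torch.split(image_features.squeeze(0), split_sizes)
print([c.shape[0] for c in chunks])  # [576, 1152]
```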
@@ -437,6 +438,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -448,12 +450,15 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if
-        #
-        #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values

         return model_inputs
@@ -260,7 +260,6 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast):

         if do_pad:
             processed_images = self._pad_for_batching(processed_images)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(
             data={"pixel_values": processed_images, "image_sizes": image_sizes}, tensor_type=return_tensors
         )
@@ -692,6 +692,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -703,12 +704,15 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        #
-        #
-
+        # Pixel values are used only in the first iteration if available
+        # In subsquent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache)
+        if is_first_iteration or not kwargs.get("use_cache", True):
             model_inputs["pixel_values"] = pixel_values
             model_inputs["image_sizes"] = image_sizes