transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/generation/continuous_batching/__init__.py

```diff
@@ -15,12 +15,16 @@
 from .cache import PagedAttentionCache
 from .continuous_api import ContinuousBatchingManager, ContinuousMixin
 from .requests import RequestState, RequestStatus
+from .scheduler import FIFOScheduler, PrefillFirstScheduler, Scheduler
 
 
 __all__ = [
     "ContinuousBatchingManager",
     "ContinuousMixin",
+    "FIFOScheduler",
     "PagedAttentionCache",
+    "PrefillFirstScheduler",
     "RequestState",
     "RequestStatus",
+    "Scheduler",
 ]
```
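The hunk above promotes the scheduler classes into the public continuous-batching namespace. A minimal sketch of what this newly allows (the `issubclass` checks assume the two concrete schedulers derive from the exported `Scheduler` base, as the naming suggests; that relationship is not shown in this diff):

```python
# Sketch: with rc2, downstream code can import the schedulers from the
# package namespace instead of reaching into the private .scheduler module.
from transformers.generation.continuous_batching import (
    FIFOScheduler,
    PrefillFirstScheduler,
    Scheduler,
)

# Assumption: both concrete schedulers implement the exported Scheduler base.
assert issubclass(FIFOScheduler, Scheduler)
assert issubclass(PrefillFirstScheduler, Scheduler)
```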
transformers/generation/continuous_batching/cache.py

```diff
@@ -121,7 +121,7 @@ class PagedAttentionCache:
         device: torch.device,
         dtype: torch.dtype = torch.float16,
         tp_size: int | None = None,
-
+        allow_block_sharing: bool = True,
     ) -> None:
         """Initialize a paged attention cache for efficient memory usage. Also turns in prefix sharing if the model has
         only full attention layers.
@@ -132,7 +132,8 @@ class PagedAttentionCache:
             device: Device for the cache tensors
             dtype: Data type of the cache
             tp_size: Tensor parallelism size
-
+            allow_block_sharing: A flag to allow block sharing. If the model has some full attention layers, then prefix
+                sharing is enabled as well.
         """
         self.config = config
         self.dtype = dtype
```
```diff
@@ -209,7 +210,7 @@ class PagedAttentionCache:
         self.key_cache: list[torch.Tensor] = []
         self.value_cache: list[torch.Tensor] = []
         # We add two extra tokens to the cache to handle padding and generally discard unwanted tokens
-        self.cache_shape = (num_blocks * self.block_size, self.num_key_value_heads, self.head_dim)
+        self.cache_shape = ((num_blocks + 2) * self.block_size, self.num_key_value_heads, self.head_dim)
         for _ in range(group_size):
             new_layer_key_cache = torch.empty(self.cache_shape, dtype=self.dtype, device=self.device)
             new_layer_value_cache = torch.empty(self.cache_shape, dtype=self.dtype, device=self.device)
```
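To make the size change concrete, here is the shape arithmetic with illustrative values (the numbers below are not from the diff): rc2 reserves two extra blocks' worth of token slots at the end of the flat cache for padding and discarded tokens.

```python
import torch

# Illustrative values; the real ones come from the model config and memory budget.
num_blocks, block_size, num_kv_heads, head_dim = 16, 32, 8, 64

# rc2 layout: one flat run of token slots per layer group, with two spare blocks.
cache_shape = ((num_blocks + 2) * block_size, num_kv_heads, head_dim)
key_cache = torch.empty(cache_shape)

# Viewing the flat tensor per block (as copy_cache does further down in this
# file) makes the two extra blocks explicit.
blocked = key_cache.view(-1, block_size, num_kv_heads, head_dim)
assert blocked.shape[0] == num_blocks + 2
```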
```diff
@@ -220,19 +221,20 @@ class PagedAttentionCache:
         logger.info(f"{self.cache_shape = } {self.key_cache[0].shape = } {self.key_cache[0].numel() = }")
 
         # Block management data structures
+        self.allow_block_sharing = allow_block_sharing
         self.group_cache_managers: list[CacheAllocator] = []
         for i, group_type in enumerate(group_types):
             if group_type == "full_attention":
-                cm = FullAttentionCacheAllocator(i, self.block_size)
+                cm = FullAttentionCacheAllocator(i, self.block_size, allow_block_sharing=allow_block_sharing)
             elif group_type == "sliding_attention":
                 cm = SlidingAttentionCacheAllocator(i, self.block_size, config.sliding_window)
             else:
                 raise ValueError(f"Invalid group type: {group_type}")
             self.group_cache_managers.append(cm)
 
-        # We only use prefix sharing if the whole model has only full attention layers
-        self.use_prefix_sharing = group_types == ["full_attention"]
-        self._block_manager = BlockManager(num_blocks, self.block_size
+        # We only use prefix sharing if the whole model has only full attention layers and block sharing is allowed
+        self.use_prefix_sharing = allow_block_sharing and group_types == ["full_attention"]
+        self._block_manager = BlockManager(num_blocks, self.block_size)
         self.blocks_to_complete: dict[str, int] = {}
         self._total_prefix_length: int = 0  # a counter to measure the impact of prefix sharing, also used in tests
 
```
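The last three changed lines of the hunk encode the new gating rule. Paraphrased as a standalone predicate (this is an illustration, not the library's code):

```python
def prefix_sharing_enabled(allow_block_sharing: bool, group_types: list[str]) -> bool:
    # Prefix sharing requires block sharing to be allowed AND every layer group
    # to be full attention (a single "full_attention" group in practice).
    return allow_block_sharing and group_types == ["full_attention"]

assert prefix_sharing_enabled(True, ["full_attention"])
assert not prefix_sharing_enabled(True, ["full_attention", "sliding_attention"])
assert not prefix_sharing_enabled(False, ["full_attention"])
```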
```diff
@@ -352,7 +354,8 @@ class PagedAttentionCache:
         allocated_blocks = []
         for b in range(len(prompt_ids) // self.block_size):
             tokens = prompt_ids[b * self.block_size : (b + 1) * self.block_size]
-
+            # Prefix sharing is only supported when there is only one full attention layer group, so group_id=0.
+            current_hash = self._block_manager.compute_hash(current_hash, tokens, group_id=0)
             block_id = self._block_manager._hash_to_id.get(current_hash)
             if block_id is not None:
                 allocated_blocks.append(block_id)
```
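The chained `compute_hash` call is what lets two requests share physical blocks for a common block-aligned prompt prefix. A toy rendering of the idea (the `compute_hash` below is a hypothetical stand-in; the real implementation lives in `BlockManager` and is not shown in this excerpt):

```python
def compute_hash(parent_hash: int | None, tokens: list[int], group_id: int) -> int:
    # Toy stand-in: chain the parent hash, the layer group, and the block's tokens.
    return hash((parent_hash, group_id, tuple(tokens)))

block_size = 4
prompt_a = [1, 2, 3, 4, 5, 6, 7, 8]
prompt_b = [1, 2, 3, 4, 9, 9, 9, 9]  # shares only the first block with prompt_a

def block_hashes(prompt: list[int]) -> list[int]:
    hashes: list[int] = []
    current = None
    for b in range(len(prompt) // block_size):
        current = compute_hash(current, prompt[b * block_size : (b + 1) * block_size], group_id=0)
        hashes.append(current)
    return hashes

ha, hb = block_hashes(prompt_a), block_hashes(prompt_b)
assert ha[0] == hb[0]  # identical first block, same hash, so it is shareable
assert ha[1] != hb[1]  # once the prompts diverge, the hash chain diverges too
```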
@@ -369,18 +372,44 @@ class PagedAttentionCache:
         self._total_prefix_length += prefix_length
         return prefix_length

-    def
-        """Marks the blocks
-        a
-
+    def mark_shareable_blocks_as_complete(self, state: RequestState) -> None:
+        """Marks the blocks allocated to a request (state) as complete if they are shareable and they have been computed
+        in the forward pass. A complete block is a block where the KV cache has been fully computed: if the block has
+        enough space to hold the cache for N tokens, the block is marked as complete when the cache data is present for
+        the N tokens. If block sharing is off, this is a no-op."""
+        num_complete_blocks = 0 if not self.allow_block_sharing else self.blocks_to_complete.pop(state.request_id)
         if num_complete_blocks == 0:
             return None
-        cm
-
-
-
-
-
+        for cm in self.group_cache_managers:
+            if cm.uses_block_sharing:
+                self._block_manager.mark_shareable_blocks_as_complete(
+                    num_complete_blocks=num_complete_blocks,
+                    allocated_blocks=cm.block_table[state.request_id],
+                    prompt_ids=(state.initial_tokens + state.generated_tokens),
+                )
+
+    def copy_cache(self, source_blocks: list[int], forked_blocks: list[int]) -> None:
+        """Copy the cache from the source blocks to the forked blocks."""
+        source_blocks = torch.tensor(source_blocks, device=self.device, dtype=torch.int32)
+        forked_blocks = torch.tensor(forked_blocks, device=self.device, dtype=torch.int32)
+        for key_cache, value_cache in zip(self.key_cache, self.value_cache):
+            key_cache = key_cache.view(-1, self.block_size, self.num_key_value_heads, self.head_dim)
+            value_cache = value_cache.view(-1, self.block_size, self.num_key_value_heads, self.head_dim)
+            key_cache[forked_blocks] = key_cache[source_blocks]
+            value_cache[forked_blocks] = value_cache[source_blocks]
+        # FIXME: consolidate the cache into a single tensor of shape (group_size, 2, *self.k_or_v_cache_shape)
+        # This will allow for better .update and a single copy instead of one per cache tensor
+
+    def fork_request(self, source_request_id: str, destination_request_ids: list[str]) -> tuple[list[int], list[int]]:
+        """Forks the cache of the request (source_request_id) into those of the requests in (destination_request_ids)."""
+        # These lists will be the accumulators for the source and destination blocks for the cache copy
+        source_blocks, destination_blocks = [], []
+        # Main fork loop
+        for cm in self.group_cache_managers:
+            src_blocks, dst_blocks = cm.fork_blocks(source_request_id, destination_request_ids, self._block_manager)
+            source_blocks.extend(src_blocks)
+            destination_blocks.extend(dst_blocks)
+        return source_blocks, destination_blocks


     # TODO: rework computation with the groups and their sizes
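fork_request gathers the flattened copy lists from every allocator, and copy_cache then performs one advanced-indexing copy per key/value tensor. A self-contained toy of that indexed copy (shapes, values, and the single-tensor setup are made up; the real method iterates over the per-group caches):

import torch

# Toy illustration of copy_cache's indexed copy: 8 blocks of size 4, 2 KV heads of dim 16.
num_blocks, block_size, num_heads, head_dim = 8, 4, 2, 16
key_cache = torch.randn(num_blocks * block_size, num_heads, head_dim)

source_blocks = torch.tensor([3, 3], dtype=torch.int32)  # the incomplete tail, copied twice
forked_blocks = torch.tensor([5, 6], dtype=torch.int32)  # one fresh block per child

blocked = key_cache.view(-1, block_size, num_heads, head_dim)
blocked[forked_blocks] = blocked[source_blocks]  # one advanced-indexing copy, in place
assert torch.equal(blocked[5], blocked[3]) and torch.equal(blocked[6], blocked[3])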
@@ -31,20 +31,21 @@ def reverse_enumerate(xs: list[T]) -> Iterator[tuple[int, T]]:
         index -= 1


-class Block:
+class Block:  # TODO: rename to ShareableBlock and update the docs
     """A class to represent a block managed by the block manager. We say that a block is complete when the physical KV
     cache it points to is fully computed. A block can have a parent, which is the block that came before in the
-    sequence. Once a block is complete, it is given a hash, which takes into account the tokens ids of the block
-    its parent's hash (if there is a parent)."""
+    sequence. Once a block is complete, it is given a hash, which takes into account the token ids of the block, the
+    layer group (group_id) it belongs to and its parent's hash (if there is a parent)."""

-    def __init__(self, id_: int, parent_id: int | None) -> None:
+    def __init__(self, id_: int, parent_id: int | None, group_id: int) -> None:
         self.id: int = id_
         self.parent_id: int | None = parent_id
+        self.group_id: int = group_id
         self.hash: int | None = None
         self.ref_count: int = 1

     def __repr__(self) -> str:
-        return f"Block(id={self.id}, parent_id={self.parent_id}, hash={self.hash}, ref_count={self.ref_count})"
+        return f"Block(id={self.id}, parent_id={self.parent_id}, group_id={self.group_id}, hash={self.hash}, ref_count={self.ref_count})"

     @property
     def is_complete(self) -> bool:
@@ -52,8 +53,9 @@ class Block:


 class BlockManager:
-    """A class to manage the number of free blocks and block re-use.
-
+    """A class to manage the number of free blocks and block re-use. When a block comes into use, a flag indicates
+    whether it is shareable. If it is, a Block object is created and tracked internally. A block can be in the
+    following states:
     - in use: one or more requests references this block, thus it cannot be written over. The number of requests
       referencing this block is stored as ref_count in the Block object.
     - un-initialized: the block points to a space in the KV cache tensor that contains no data yet. Those blocks can
@@ -63,19 +65,19 @@ class BlockManager:
       the ref_count of the block and remove it from the list of initialized blocks, because it is now in use.
       Still, the block can be freed if no un-initialized blocks are left. In that case, we remove its hash from the
       hash table.
+    If the block is not shareable, we just use the block manager as a FIFO structure where blocks are either free or in
+    use. Shareability is determined by the type of cache allocator: blocks created for full attention layers are
+    shareable, while blocks created for sliding window attention layers are not.
     There is no structure to keep track of the blocks in use: if a block is neither un-initialized nor initialized,
     it is in use.
     """

-    def __init__(self, num_blocks: int, block_size: int, use_prefix_sharing: bool) -> None:
-        """Initializes the block manager with a given number of blocks (num_blocks) of size (block_size). Prefix sharing
-        can be turned on with the (use_prefix_sharing) flag, which only happens if the model has only full attention
-        layers."""
+    def __init__(self, num_blocks: int, block_size: int) -> None:
+        """Initializes the block manager with a given number of blocks (num_blocks) of size (block_size)."""
         self.num_blocks = num_blocks
         self.block_size = block_size
         self._uninit_block_ids = deque(range(num_blocks))
         self._init_block_ids: dict[int, None] = {}  # effectively act as an ordered set
-        self._use_prefix_sharing = use_prefix_sharing
         self._hash_to_id: dict[int, int] = {}
         self._id_to_block: dict[int, Block] = {}

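A rough walkthrough of those states (a sketch assuming BlockManager as defined in this diff is importable; the block counts and the peek at private attributes are purely illustrative):

# Toy walkthrough of the block states described above (all values illustrative).
manager = BlockManager(num_blocks=8, block_size=4)

# un-initialized -> in use: allocation pops ids from the free queue; shareable
# blocks get a Block object chained to their parent via last_block_id.
blocks = manager.get_free_blocks(2, last_block_id=None, shareable=True, group_id=0)
assert len(manager._uninit_block_ids) == 6  # peeking at a private attribute, for illustration only

# in use -> free/initialized: freeing shareable blocks goes through ref-counting,
# so a complete block keeps its hash and can later be re-used by another request.
manager.free_blocks(blocks, shareable=True)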
@@ -102,22 +104,81 @@ class BlockManager:
         self._uninit_block_ids.append(id_to_uninitialize)
         return True

-    def get_free_blocks(
-
-
-
+    def get_free_blocks(
+        self, n_blocks: int, last_block_id: int | None, shareable: bool, group_id: int
+    ) -> list[int] | None:
+        """Returns a list of (n_blocks) free blocks and marks them as no longer free in the internal data structures.
+        If the (shareable) flag is set to True, a Block object is created to keep track of each block, with
+        (last_block_id) indicating the last block id already in the sequence, also named the parent block. If the
+        manager cannot find enough free blocks, it returns None."""
         if not self.has_enough_free_blocks(n_blocks):
             return None
         allocated_block_ids = [self._uninit_block_ids.popleft() for _ in range(n_blocks)]
-        # If
-        if
+        # If the block is shareable, we keep track of the allocated blocks as partial blocks
+        if shareable:
             for block_id in allocated_block_ids:
-                block = Block(block_id, last_block_id)
+                block = Block(block_id, last_block_id, group_id)
                 self._id_to_block[block_id] = block
                 last_block_id = block_id
         # In both cases, we return the allocated block ids
         return allocated_block_ids

+    def fork_blocks(
+        self, parent_blocks: list[int], num_forks: int, shareable: bool, group_id: int
+    ) -> tuple[list[list[int]], list[int], list[int]]:
+        """Forks a given list of (parent_blocks) as many times as (num_forks). If the blocks are (shareable), we take
+        references on the blocks that are complete. Otherwise, we allocate new blocks and keep track of their indices
+        to later copy the physical cache. For instance, when forking 4 blocks for 2 children:
+
+        Parent blocks: [0, 1, 2, 3], with all blocks being complete except the last one (block 3).
+
+        ----------------------------------------- IF BLOCKS ARE NOT SHAREABLE -----------------------------------------
+
+        Forked blocks lists: [[5, 6, 7, 8], [9, 10, 11, 12]]
+        Copy source:          [0, 1, 2, 3,   0, 1,  2,  3]
+                               ↓  ↓  ↓  ↓    ↓  ↓   ↓   ↓
+        Copy destination:     [5, 6, 7, 8,   9, 10, 11, 12]  → 8 blocks are newly allocated and copied
+
+        ----------------------------------------- IF BLOCKS ARE SHAREABLE ---------------------------------------------
+
+        Forked blocks lists: [[0, 1, 2, 5], [0, 1, 2, 6]]
+        Copy source:          [         3,            3]  (block 3 is not complete so it's copied, not referenced)
+                                        ↓             ↓
+        Copy destination:     [         5,            6]  → only 2 blocks are newly allocated and copied
+        """
+        # First phase: reference all complete blocks
+        forked_by_reference = []
+        if shareable:
+            for block_id in parent_blocks:
+                block = self._id_to_block[block_id]
+                if block.is_complete:
+                    forked_by_reference.append(block.id)
+                    block.ref_count += num_forks
+                else:
+                    break
+
+        # Early return if we have forked all blocks by reference
+        blocks_to_copy = len(parent_blocks) - len(forked_by_reference)
+        if blocks_to_copy == 0:
+            return [forked_by_reference[:] for _ in range(num_forks)], [], []
+
+        # From now on, each child will have its own list of blocks
+        forked_blocks_lists = []
+        copy_src = []
+        copy_dst = []
+
+        # Second phase: allocate new blocks if needed
+        parent_id = forked_by_reference[-1] if forked_by_reference else None
+        for _ in range(num_forks):
+            allocated_block_ids = self.get_free_blocks(blocks_to_copy, parent_id, shareable, group_id)
+            if allocated_block_ids is None:
+                return None, [], []
+            forked_blocks_lists.append(forked_by_reference + allocated_block_ids)
+            copy_src.extend(parent_blocks[-blocks_to_copy:])
+            copy_dst.extend(allocated_block_ids)
+        return forked_blocks_lists, copy_src, copy_dst
+
     def increase_ref_count(self, block_id: int) -> None:
         """Increases the reference count of a given (block_id)."""
         block = self._id_to_block[block_id]
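The docstring's scenario can be traced through the code directly. A sketch reproducing it (assumes BlockManager from this diff is importable; the hashes are faked on the first three blocks, relying on is_complete being driven by the hash being set, as the Block docstring suggests):

# Toy reproduction of the docstring scenario (all ids illustrative).
manager = BlockManager(num_blocks=16, block_size=4)
parent = manager.get_free_blocks(4, last_block_id=None, shareable=True, group_id=0)  # [0, 1, 2, 3]
for block_id in parent[:3]:
    manager._id_to_block[block_id].hash = block_id  # fake hashes: marks blocks 0-2 as complete

forks, copy_src, copy_dst = manager.fork_blocks(parent, num_forks=2, shareable=True, group_id=0)
# forks    == [[0, 1, 2, 4], [0, 1, 2, 5]]  -> the complete prefix is shared by reference
# copy_src == [3, 3]; copy_dst == [4, 5]    -> only the incomplete tail block is copied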
@@ -137,23 +198,23 @@ class BlockManager:
             self._id_to_block.pop(block_id)
             self._uninit_block_ids.append(block_id)

-    def free_blocks(self, blocks: list[int]) -> None:
-        """Marks a list of (blocks) as free. If
+    def free_blocks(self, blocks: list[int], shareable: bool) -> None:
+        """Marks a list of (blocks) as free. If the blocks were not (shareable), we simply add them to the uninitialized
         blocks queue. Otherwise, their new state depends on whether they are complete."""
-        if
+        if shareable:
             for block_id in blocks:
                 self.decrease_ref_count(block_id)
         else:
             self._uninit_block_ids.extend(blocks)

-    def
+    def mark_shareable_blocks_as_complete(
         self, num_complete_blocks: int, allocated_blocks: list[int], prompt_ids: list[int]
     ) -> None:
         """Among the list of (allocated_blocks), mark (num_complete_blocks) incomplete blocks as now complete. The list
         of (prompt_ids) is used to compute the hash of the new block."""
         # Look for the first complete block, starting from the last block in the sequence
         parent_hash = None
-        incomplete_blocks: list[Block] = []
+        incomplete_blocks: list[tuple[int, Block]] = []
         for i, block_id in reverse_enumerate(allocated_blocks):
             block = self._id_to_block[block_id]
             if block.is_complete:
@@ -178,7 +239,7 @@ class BlockManager:
             # Otherwise, we compute the hash
            num_complete_blocks -= 1
            tokens = prompt_ids[i * self.block_size : (i + 1) * self.block_size]
-            block.hash = self.compute_hash(parent_hash, tokens)
+            block.hash = self.compute_hash(parent_hash, tokens, block.group_id)

            existing_block_id = self._hash_to_id.get(block.hash)
            # If the block hash is already in the hash to id mapping, we reference the existing block instead
@@ -187,19 +248,20 @@ class BlockManager:
                 allocated_blocks[i] = existing_block_id
                 self._id_to_block[existing_block_id].ref_count += 1
                 new_parent_id = existing_block_id
-                self.free_blocks([block.id])
+                self.free_blocks([block.id], shareable=True)

             # Otherwise, we add the completed block to the hash table
             else:
+                logger.debug(f"Adding new block {block.id} (group {block.group_id}) with hash {block.hash}")
                 self._hash_to_id[block.hash] = block.id

             # Update loop variables
             parent_hash = block.hash

-    def compute_hash(self, parent_hash: int | None, tokens: list[int]) -> int:
-        """Computes the hash of a block
-        parent, the parent hash is None."""
-        return hash((parent_hash, tuple(tokens)))
+    def compute_hash(self, parent_hash: int | None, tokens: list[int], group_id: int) -> int:
+        """Computes the hash of a block identified by the (tokens) it contains, its (parent_hash) and the layer group
+        (group_id) it belongs to. If the block has no parent, the parent hash is None."""
+        return hash((parent_hash, tuple(tokens), group_id))


 class CacheAllocator(ABC):
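Folding group_id into the hash keeps identical token blocks cached for different layer groups from aliasing the same physical block, and chaining parent_hash keeps identical tokens at different positions apart. A quick sketch (again assuming BlockManager from this diff; values illustrative):

# Same tokens, different group or different position -> different hashes.
manager = BlockManager(num_blocks=4, block_size=4)
tokens = [1, 2, 3, 4]
h_group0 = manager.compute_hash(None, tokens, group_id=0)
h_group1 = manager.compute_hash(None, tokens, group_id=1)
h_chained = manager.compute_hash(h_group0, tokens, group_id=0)  # same block repeated later in the sequence
assert h_group0 != h_group1 and h_group0 != h_chained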
@@ -208,6 +270,7 @@ class CacheAllocator(ABC):

     _index: int
     block_table: dict[str, list[int]]  # request_id -> list of block_ids allocated to the request
+    uses_block_sharing: bool  # flag to determine if the blocks are shareable

     @abstractmethod
     def allocate_blocks(self, n_blocks: int, request_id: str, block_manager: BlockManager) -> int | None:
@@ -218,7 +281,7 @@ class CacheAllocator(ABC):
         """Frees all blocks associated with a (request_id) using the (block_manager)."""
         if request_id in self.block_table:
             blocks_to_free = self.block_table.pop(request_id)
-            block_manager.free_blocks(blocks_to_free)
+            block_manager.free_blocks(blocks_to_free, shareable=self.uses_block_sharing)
         else:
             logger.warning(
                 f"CacheAllocator {self._index} attempted to free blocks for non-existent request_id: {request_id}"
@@ -236,17 +299,48 @@ class CacheAllocator(ABC):
     def get_seqlens_k(self, request_id: str, past_length: int, query_length: int) -> tuple[str, int]:
         """Returns the attention type of the cache allocator and the key sequence length for the given request_id."""

+    def fork_blocks(
+        self, parent_request_id: str, children_request_ids: list[str], block_manager: BlockManager
+    ) -> tuple[list[int], list[int]]:
+        """Forks the cache blocks of a (parent_request_id) to a list of (children_request_ids). To manage the blocks,
+        the (block_manager) is used. When forking, the children's blocks are either shared with the parent or copied
+        from it. Hence we return two lists of blocks that need to be copied: one for the source and one for the
+        destination."""
+        # Sanity checks
+        if parent_request_id not in self.block_table:
+            raise ValueError(f"No block table found for request {parent_request_id}")
+
+        # Actual forking
+        parent_blocks = self.block_table[parent_request_id]
+        list_forked_blocks, copy_src, copy_dst = block_manager.fork_blocks(
+            parent_blocks=parent_blocks,
+            num_forks=len(children_request_ids),
+            shareable=self.uses_block_sharing,
+            group_id=self._index,
+        )
+        if list_forked_blocks is None:
+            raise ValueError(f"Failed to fork blocks for request {parent_request_id}")
+
+        # Update the block table for all children requests
+        for children_request_id, forked_blocks in zip(children_request_ids, list_forked_blocks):
+            if children_request_id in self.block_table:
+                raise ValueError(f"Block table already exists for request {children_request_id}")
+            self.block_table[children_request_id] = forked_blocks
+        return copy_src, copy_dst
+

 class FullAttentionCacheAllocator(CacheAllocator):
     """Cache manager for a group of full attention layers."""

-    def __init__(self, index: int, block_size: int) -> None:
+    def __init__(self, index: int, block_size: int, allow_block_sharing: bool) -> None:
         """Initializes the cache manager for a group of full attention layers.
         Args:
             - index: the index of the associated layer group
             - block_size: the size of the blocks in the cache
         """
         self._index = index
+        self.uses_block_sharing = allow_block_sharing
         self.block_size = block_size
         self.block_table = {}

@@ -261,7 +355,7 @@ class FullAttentionCacheAllocator(CacheAllocator):
         else:
             last_block_id = self.block_table[request_id][-1]
         # Actual allocation, return early if failed
-        allocated_blocks = block_manager.get_free_blocks(n_blocks, last_block_id)
+        allocated_blocks = block_manager.get_free_blocks(n_blocks, last_block_id, self.uses_block_sharing, self._index)
         if allocated_blocks is None:
             return None
         self.block_table[request_id].extend(allocated_blocks)
@@ -315,6 +409,7 @@ class SlidingAttentionCacheAllocator(CacheAllocator):
             - sliding_window: the size of the sliding window
         """
         self._index = index
+        self.uses_block_sharing = False
         self.block_size = block_size
         self.sliding_window = sliding_window
         self._max_blocks_per_request = ceil(self.sliding_window / self.block_size)
@@ -334,7 +429,9 @@ class SlidingAttentionCacheAllocator(CacheAllocator):
         after_allocation = min(already_allocated + n_blocks, self._max_blocks_per_request)
         actual_n_blocks = after_allocation - already_allocated
         # Classic allocation
-        allocated_blocks = block_manager.get_free_blocks(
+        allocated_blocks = block_manager.get_free_blocks(
+            actual_n_blocks, None, self.uses_block_sharing, self._index
+        )  # no block sharing w/ sliding window
         if allocated_blocks is None:
             return None
         self.block_table[request_id].extend(allocated_blocks)
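The sliding-window allocator never holds more than ceil(sliding_window / block_size) blocks per request, so allocation requests near the cap are clipped rather than refused. A small worked instance of that arithmetic (all numbers illustrative):

from math import ceil

# A 4096-token sliding window with 32-token blocks never needs more than 128 blocks.
sliding_window, block_size = 4096, 32
max_blocks_per_request = ceil(sliding_window / block_size)  # 128

# A request at 126 blocks asking for 8 more is clipped to the 2 it can still use.
already_allocated, n_blocks = 126, 8
after_allocation = min(already_allocated + n_blocks, max_blocks_per_request)  # 128
actual_n_blocks = after_allocation - already_allocated
assert (max_blocks_per_request, actual_n_blocks) == (128, 2)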