transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/quantizers/quantizer_bnb_4bit.py

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from collections import defaultdict
 from typing import TYPE_CHECKING
 
 from .base import HfQuantizer
@@ -38,43 +37,20 @@ if is_torch_available():
     import torch
 
     from ..core_model_loading import WeightConverter
-    from ..pytorch_utils import Conv1D
 
 logger = logging.get_logger(__name__)
 
 
 class Bnb4BitHfQuantizer(HfQuantizer):
     """
-    4-bit quantization from bitsandbytes quantization method
-    before loading: converts transformer layers into Linear4bit during loading: load 16bit weight and pass to the
-    layer object after: quantizes individual weights in Linear4bit into 4bit at the first .cuda() call
-    saving:
-        from state dict, as usual; saves weights and `quant_state` components
-    loading:
-        need to locate `quant_state` components and pass to Param4bit constructor
+    4-bit quantization from bitsandbytes quantization method
     """
 
-    use_keep_in_fp32_modules = True
-    requires_parameters_quantization = True
     requires_calibration = False
 
-    required_packages = ["bitsandbytes", "accelerate"]
-
     def __init__(self, quantization_config, **kwargs):
         super().__init__(quantization_config, **kwargs)
 
-        if self.quantization_config.llm_int8_skip_modules is not None:
-            self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules
-
-        # This describes the additional items that are saved on the state dict (on the params themselves)
-        self.bnb_keys = [
-            f"quant_state.bitsandbytes__{self.quantization_config.bnb_4bit_quant_type}",
-            "absmax",
-            "quant_map",
-        ]
-        if self.quantization_config.bnb_4bit_use_double_quant:
-            self.bnb_keys.extend(["nested_absmax", "nested_quant_map"])
-
     def validate_environment(self, *args, **kwargs):
         if not is_accelerate_available():
             raise ImportError(
@@ -90,17 +66,9 @@ class Bnb4BitHfQuantizer(HfQuantizer):
         validate_bnb_backend_availability(raise_exception=True)
 
         device_map = kwargs.get("device_map")
-        if (
-            device_map is not None
-            and isinstance(device_map, dict)
-            and not self.quantization_config.llm_int8_enable_fp32_cpu_offload
-        ):
-            device_map_without_lm_head = {
-                key: device_map[key] for key in device_map if key not in self.modules_to_not_convert
-            }
-            if set(device_map.values()) == {"cpu"}:
-                pass
-            elif "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values():
+        if not self.quantization_config.llm_int8_enable_fp32_cpu_offload and isinstance(device_map, dict):
+            values = set(device_map.values())
+            if values != {"cpu"} and ("cpu" in values or "disk" in values):
                 raise ValueError(
                     "Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the "
                     "quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules "
@@ -110,115 +78,25 @@ class Bnb4BitHfQuantizer(HfQuantizer):
|
|
|
110
78
|
"for more details. "
|
|
111
79
|
)
|
|
112
80
|
|
|
113
|
-
def
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
return CustomDtype.INT4
|
|
81
|
+
def param_element_size(self, model: "PreTrainedModel", param_name: str, param: "torch.Tensor") -> float:
|
|
82
|
+
"Return the element size (in bytes) for `param_name`."
|
|
83
|
+
if self.param_needs_quantization(model, param_name):
|
|
84
|
+
# 4 bit
|
|
85
|
+
return 0.5
|
|
119
86
|
|
|
120
|
-
|
|
121
|
-
return [k for k in unexpected_keys if not any(k.endswith(x) for x in self.bnb_keys)]
|
|
87
|
+
return super().param_element_size(model, param_name, param)
|
|
122
88
|
|
|
123
89
|
def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
|
|
124
90
|
import bitsandbytes as bnb
|
|
125
91
|
|
|
126
|
-
# They are on the params themselves, so we cannot easily extract the module from the name
|
|
127
|
-
if any(param_name.endswith(x) for x in self.bnb_keys):
|
|
128
|
-
return True
|
|
129
92
|
module, name = get_module_from_name(model, param_name)
|
|
130
93
|
return isinstance(module, bnb.nn.Linear4bit) and name != "bias"
|
|
131
94
|
|
|
132
|
-
def get_param_name(self, param_name: str) -> str:
|
|
133
|
-
"""
|
|
134
|
-
Get the right param_name in order to get the module associated with the param.
|
|
135
|
-
This is useful for quantized stats lile absmax or quant_map as we need to update the param_name to get the module as they are stored in ...weight.absmax.
|
|
136
|
-
"""
|
|
137
|
-
if self.pre_quantized:
|
|
138
|
-
# We need to get the param name of quantized weights and not its components. Otherwise, we won't be able to get the nn.Module associated.
|
|
139
|
-
if any(param_name.endswith(x) for x in self.bnb_keys):
|
|
140
|
-
param_name = (
|
|
141
|
-
param_name.rsplit(".", 1)[0] if "quant_state." not in param_name else param_name.rsplit(".", 2)[0]
|
|
142
|
-
)
|
|
143
|
-
return param_name
|
|
144
|
-
|
|
145
|
-
def create_quantized_param(
|
|
146
|
-
self,
|
|
147
|
-
model: "PreTrainedModel",
|
|
148
|
-
param_value: "torch.Tensor",
|
|
149
|
-
param_name: str,
|
|
150
|
-
target_device: "torch.device",
|
|
151
|
-
**kwargs,
|
|
152
|
-
):
|
|
153
|
-
import bitsandbytes as bnb
|
|
154
|
-
|
|
155
|
-
full_name = param_name
|
|
156
|
-
|
|
157
|
-
# update param name to get the weights instead of the quantized stats
|
|
158
|
-
param_name = self.get_param_name(param_name)
|
|
159
|
-
module, tensor_name = get_module_from_name(model, param_name)
|
|
160
|
-
|
|
161
|
-
# `torch.Tensor.to(<int num>)` is not supported by `torch_npu` (see this [issue](https://github.com/Ascend/pytorch/issues/16)).
|
|
162
|
-
if isinstance(target_device, int) and is_torch_npu_available():
|
|
163
|
-
target_device = f"npu:{target_device}"
|
|
164
|
-
|
|
165
|
-
# construct `new_value` for the module._parameters[tensor_name]
|
|
166
|
-
if self.pre_quantized:
|
|
167
|
-
module_name = param_name.rsplit(".", 1)[0]
|
|
168
|
-
# Save the states for later quantization when they are all gathered
|
|
169
|
-
if not hasattr(self, "param_quant_stats"):
|
|
170
|
-
self.param_quant_stats = defaultdict(dict)
|
|
171
|
-
self.param_quant_stats[module_name].update({full_name: param_value})
|
|
172
|
-
|
|
173
|
-
# We are ready for quantization in this case (note, the +1 is for the weight itself)
|
|
174
|
-
if len(self.param_quant_stats[module_name]) == len(self.bnb_keys) + 1:
|
|
175
|
-
weight = self.param_quant_stats[module_name].pop(f"{module_name}.weight")
|
|
176
|
-
new_value = bnb.nn.Params4bit.from_prequantized(
|
|
177
|
-
data=weight,
|
|
178
|
-
quantized_stats=self.param_quant_stats[module_name],
|
|
179
|
-
requires_grad=False,
|
|
180
|
-
device=target_device,
|
|
181
|
-
module=module,
|
|
182
|
-
)
|
|
183
|
-
# Set it
|
|
184
|
-
module._parameters[tensor_name] = new_value
|
|
185
|
-
# Delete the states
|
|
186
|
-
del self.param_quant_stats[module_name]
|
|
187
|
-
else:
|
|
188
|
-
new_value = param_value.to("cpu")
|
|
189
|
-
old_value = getattr(module, tensor_name)
|
|
190
|
-
|
|
191
|
-
# Support models using `Conv1D` in place of `nn.Linear` (e.g. openai-community/gpt2) by transposing the weight matrix prior to quantization.
|
|
192
|
-
# Since weights are saved in the correct "orientation", we skip transposing when loading.
|
|
193
|
-
if issubclass(module.source_cls, Conv1D):
|
|
194
|
-
new_value = new_value.T
|
|
195
|
-
|
|
196
|
-
kwargs = old_value.__dict__
|
|
197
|
-
kwargs.pop("_is_hf_initialized", None)
|
|
198
|
-
new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(target_device)
|
|
199
|
-
|
|
200
|
-
module._parameters[tensor_name] = new_value
|
|
201
|
-
|
|
202
|
-
# Copied from transformers.quantizers.quantizer_bnb_8bit.Bnb8BitHfQuantizer.adjust_max_memory
|
|
203
95
|
def adjust_max_memory(self, max_memory: dict[str, int | str]) -> dict[str, int | str]:
|
|
204
96
|
# need more space for buffers that are created during quantization
|
|
205
97
|
max_memory = {key: val * 0.90 for key, val in max_memory.items()}
|
|
206
98
|
return max_memory
|
|
207
99
|
|
|
208
|
-
# Copied from transformers.quantizers.quantizer_bnb_8bit.Bnb8BitHfQuantizer.update_dtype
|
|
209
|
-
def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
|
|
210
|
-
if dtype is None:
|
|
211
|
-
# We force the `dtype` to be float16, this is a requirement from `bitsandbytes`
|
|
212
|
-
logger.info(
|
|
213
|
-
"Overriding dtype=%s with `dtype=torch.float16` due to "
|
|
214
|
-
"requirements of `bitsandbytes` to enable model loading in 8-bit or 4-bit. "
|
|
215
|
-
"Pass your own dtype to specify the dtype of the remaining non-linear layers or pass"
|
|
216
|
-
" dtype=torch.float16 to remove this warning.",
|
|
217
|
-
dtype,
|
|
218
|
-
)
|
|
219
|
-
dtype = torch.float16
|
|
220
|
-
return dtype
|
|
221
|
-
|
|
222
100
|
def update_device_map(self, device_map):
|
|
223
101
|
if device_map is None:
|
|
224
102
|
if torch.cuda.is_available():
|
|
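Both bitsandbytes quantizers drop their bespoke `create_quantized_param`/`get_param_name` machinery in favor of the new `param_element_size` hook (plus the `get_quantize_ops` entry point): instead of intercepting every tensor, the quantizer now just tells the loader how many bytes a quantized element occupies. A minimal sketch of how a loader could budget weight memory from this hook; `quantizer` and `model` are assumed to already exist, and the loop is illustrative rather than the actual loading code:

    # Sketch: estimate weight memory from per-parameter element sizes.
    total_bytes = 0.0
    for name, param in model.named_parameters():
        # 0.5 bytes per element for 4-bit weights, param.element_size() otherwise
        total_bytes += param.numel() * quantizer.param_element_size(model, name, param)
    print(f"estimated weight memory: {total_bytes / 2**30:.2f} GiB")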
@@ -238,33 +116,23 @@ class Bnb4BitHfQuantizer(HfQuantizer):
             )
         return device_map

-    # Copied from transformers.quantizers.quantizer_bnb_8bit.Bnb8BitHfQuantizer._process_model_before_weight_loading
     def _process_model_before_weight_loading(
         self,
         model: "PreTrainedModel",
         device_map,
-        keep_in_fp32_modules: list[str] | None = None,
         **kwargs,
     ):
         from ..integrations import replace_with_bnb_linear

-        llm_int8_enable_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload
-
         self.modules_to_not_convert = self.get_modules_to_not_convert(
-            model, self.quantization_config.llm_int8_skip_modules,
+            model, self.quantization_config.llm_int8_skip_modules, model._keep_in_fp32_modules
         )

-
-
-
+        if self.quantization_config.llm_int8_enable_fp32_cpu_offload:
+            if isinstance(device_map, dict):
+                keys_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]
+                self.modules_to_not_convert.extend(keys_on_cpu)

-        if len(keys_on_cpu) > 0 and not llm_int8_enable_fp32_cpu_offload:
-            raise ValueError(
-                "If you want to offload some keys to `cpu` or `disk`, you need to set "
-                "`llm_int8_enable_fp32_cpu_offload=True`. Note that these modules will not be "
-                " converted to 8-bit but kept in 32-bit."
-            )
-        self.modules_to_not_convert.extend(keys_on_cpu)
         model = replace_with_bnb_linear(
             model,
             modules_to_not_convert=self.modules_to_not_convert,
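The CPU/disk offload handling is also inverted: the old code raised a `ValueError` when keys landed on CPU without `llm_int8_enable_fp32_cpu_offload=True` (that rejection now lives in `validate_environment`), while the new code simply adds the offloaded modules to `modules_to_not_convert` when the flag is set. A hedged usage sketch, mirroring the long-standing bitsandbytes offload recipe; the model name and device_map keys are illustrative:

    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(load_in_4bit=True, llm_int8_enable_fp32_cpu_offload=True)
    model = AutoModelForCausalLM.from_pretrained(
        "bigscience/bloom-1b7",
        device_map={"transformer": 0, "lm_head": "cpu"},  # lm_head stays unquantized on CPU
        quantization_config=bnb_config,
    )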
@@ -272,27 +140,22 @@ class Bnb4BitHfQuantizer(HfQuantizer):
             pre_quantized=self.pre_quantized,
         )

-        model.config.quantization_config = self.quantization_config
-
-    # Copied from transformers.quantizers.quantizer_bnb_8bit.Bnb8BitHfQuantizer._process_model_after_weight_loading with 8bit->4bit
     def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
         model.is_loaded_in_4bit = True
         model.is_4bit_serializable = self.is_serializable()
         return model

-    def is_serializable(self
+    def is_serializable(self):
         return True

     @property
     def is_trainable(self) -> bool:
         return True

-    def _dequantize(self, model):
+    def _dequantize(self, model, dtype=None):
         from ..integrations import dequantize_and_replace

-        model = dequantize_and_replace(
-            model, self.modules_to_not_convert, quantization_config=self.quantization_config
-        )
+        model = dequantize_and_replace(model, quantization_config=self.quantization_config, dtype=dtype)
         return model

     def get_quantize_ops(self):
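`_dequantize` now takes an optional `dtype`, so the restored linear layers can be materialized in a caller-chosen precision rather than whatever `dequantize_and_replace` previously inferred. A minimal sketch, calling the private hook directly for illustration and assuming `model` was loaded in 4-bit with its quantizer attached as `hf_quantizer`:

    import torch

    # Hedged sketch: restore dequantized weights in bf16 (private API shown
    # only to illustrate the new dtype parameter).
    model = model.hf_quantizer._dequantize(model, dtype=torch.bfloat16)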
--- a/transformers/quantizers/quantizer_bnb_8bit.py
+++ b/transformers/quantizers/quantizer_bnb_8bit.py
@@ -25,6 +25,8 @@ from ..utils import (
     is_accelerate_available,
     is_bitsandbytes_available,
     is_torch_available,
+    is_torch_hpu_available,
+    is_torch_npu_available,
     is_torch_xpu_available,
     logging,
 )
@@ -35,34 +37,20 @@ if is_torch_available():
     import torch

     from ..core_model_loading import WeightConverter
-    from ..pytorch_utils import Conv1D

 logger = logging.get_logger(__name__)


 class Bnb8BitHfQuantizer(HfQuantizer):
     """
-    8-bit quantization from bitsandbytes quantization method
-    before loading: converts transformer layers into Linear8bitLt during loading: load 16bit weight and pass to the
-    layer object after: quantizes individual weights in Linear8bitLt into 8bit at fitst .cuda() call
-    saving:
-        from state dict, as usual; saves weights and 'SCB' component
-    loading:
-        need to locate SCB component and pass to the Linear8bitLt object
+    8-bit quantization from bitsandbytes quantization method
     """

-    use_keep_in_fp32_modules = True
-    requires_parameters_quantization = True
     requires_calibration = False

-    required_packages = ["bitsandbytes", "accelerate"]
-
     def __init__(self, quantization_config, **kwargs):
         super().__init__(quantization_config, **kwargs)

-        if self.quantization_config.llm_int8_skip_modules is not None:
-            self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules
-
     def validate_environment(self, *args, **kwargs):
         if not is_accelerate_available():
             raise ImportError(
@@ -78,17 +66,9 @@ class Bnb8BitHfQuantizer(HfQuantizer):
         validate_bnb_backend_availability(raise_exception=True)

         device_map = kwargs.get("device_map")
-        if (
-
-            and
-            and not self.quantization_config.llm_int8_enable_fp32_cpu_offload
-        ):
-            device_map_without_lm_head = {
-                key: device_map[key] for key in device_map if key not in self.modules_to_not_convert
-            }
-            if set(device_map.values()) == {"cpu"}:
-                pass
-            elif "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values():
+        if not self.quantization_config.llm_int8_enable_fp32_cpu_offload and isinstance(device_map, dict):
+            values = set(device_map.values())
+            if values != {"cpu"} and ("cpu" in values or "disk" in values):
                 raise ValueError(
                     "Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the "
                     "quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules "
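The rewritten check collapses the old `device_map_without_lm_head` dance into one set comparison: an all-CPU map is still allowed, and any mixed map touching `cpu` or `disk` is rejected unless fp32 CPU offload is enabled. Illustrative maps under the new rule:

    ok_map = {"": "cpu"}                            # values == {"cpu"}: allowed
    bad_map = {"transformer": 0, "lm_head": "cpu"}  # mixed GPU/CPU: raises ValueError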
@@ -103,23 +83,14 @@ class Bnb8BitHfQuantizer(HfQuantizer):
         max_memory = {key: val * 0.90 for key, val in max_memory.items()}
         return max_memory

-    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
-        if dtype is None:
-            # We force the `dtype` to be float16, this is a requirement from `bitsandbytes`
-            logger.info(
-                "Overriding dtype=%s with `dtype=torch.float16` due to "
-                "requirements of `bitsandbytes` to enable model loading in 8-bit or 4-bit. "
-                "Pass your own dtype to specify the dtype of the remaining non-linear layers or pass"
-                " dtype=torch.float16 to remove this warning.",
-                dtype,
-            )
-            dtype = torch.float16
-        return dtype
-
     def update_device_map(self, device_map):
         if device_map is None:
             if torch.cuda.is_available():
                 device_map = {"": torch.cuda.current_device()}
+            elif is_torch_npu_available():
+                device_map = {"": f"npu:{torch.npu.current_device()}"}
+            elif is_torch_hpu_available():
+                device_map = {"": f"hpu:{torch.hpu.current_device()}"}
             elif is_torch_xpu_available():
                 device_map = {"": torch.xpu.current_device()}
             else:
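`update_device_map` gains Ascend NPU and Intel Gaudi HPU fallbacks, so `device_map=None` now resolves in the order cuda, npu, hpu, xpu. A hedged sketch of what that means in practice; the model name is illustrative:

    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-350m",
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    )  # on an Ascend host this now defaults to {"": "npu:0"} without an explicit device_map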
@@ -131,14 +102,12 @@ class Bnb8BitHfQuantizer(HfQuantizer):
             )
         return device_map

-    def
-
-
-
-
-        bnb_keys = ["SCB", "weight_format"]
-        return [k for k in unexpected_keys if not any(k.endswith(x) for x in bnb_keys)]
+    def param_element_size(self, model: "PreTrainedModel", param_name: str, param: "torch.Tensor") -> float:
+        "Return the element size (in bytes) for `param_name`."
+        if self.param_needs_quantization(model, param_name):
+            # 8-bit
+            return 1
+        return super().param_element_size(model, param_name, param)

     def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
         import bitsandbytes as bnb
@@ -146,47 +115,6 @@ class Bnb8BitHfQuantizer(HfQuantizer):
         module, name = get_module_from_name(model, param_name)
         return isinstance(module, bnb.nn.Linear8bitLt) and name != "bias"

-    def create_quantized_param(
-        self,
-        model: "PreTrainedModel",
-        param_value: "torch.Tensor",
-        param_name: str,
-        target_device: "torch.device",
-        **kwargs,
-    ):
-        import bitsandbytes as bnb
-
-        module, tensor_name = get_module_from_name(model, param_name)
-
-        if self.pre_quantized and not self.is_serializable():
-            raise ValueError(
-                "Detected int8 weights but the version of bitsandbytes is not compatible with int8 serialization. "
-                "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`."
-            )
-        # Those 2 can only happen when self.pre_quantized == True
-        if tensor_name == "SCB":
-            setattr(module.weight, "SCB", param_value.to(target_device))
-            return
-        # It's not used, but it's getting serialized for BC reason...
-        elif tensor_name == "weight_format":
-            return
-
-        # Support models using `Conv1D` in place of `nn.Linear` (e.g. openai-community/gpt2) by transposing the weight matrix prior to quantization.
-        # Since weights are saved in the correct "orientation", we skip transposing when loading.
-        if issubclass(module.source_cls, Conv1D) and not self.pre_quantized:
-            param_value = param_value.T
-
-        old_value = getattr(module, tensor_name)
-        kwargs = old_value.__dict__
-        kwargs.pop("_is_hf_initialized", None)
-        # Need to pop SCB and reset it because of bnb internals that modifies its value when switching devices ...
-        SCB = kwargs.pop("SCB", None)
-        new_value = bnb.nn.Int8Params(param_value.to("cpu"), requires_grad=False, **kwargs).to(target_device)
-        if SCB is not None:
-            setattr(new_value, "SCB", SCB)
-        # Set it to the module
-        module._parameters[tensor_name] = new_value
-
     def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
         model.is_loaded_in_8bit = True
         model.is_8bit_serializable = self.is_serializable()
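The deleted 8-bit path manually built `bnb.nn.Int8Params`, shuffled the `SCB` scale tensor around device moves, and skipped the legacy `weight_format` key; all of that now lives behind `get_quantize_ops`. For reference, a miniature of what the removed code relied on, assuming bitsandbytes and a CUDA device are available:

    import torch
    import bitsandbytes as bnb

    w = torch.randn(128, 128, dtype=torch.float16)
    p = bnb.nn.Int8Params(w, requires_grad=False).to("cuda")  # quantizes on device transfer
    print(p.dtype, p.SCB.shape)  # torch.int8, per-row scales of length 128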
@@ -196,28 +124,18 @@ class Bnb8BitHfQuantizer(HfQuantizer):
         self,
         model: "PreTrainedModel",
         device_map,
-        keep_in_fp32_modules: list[str] | None = None,
         **kwargs,
     ):
         from ..integrations import replace_with_bnb_linear

-        llm_int8_enable_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload
-
         self.modules_to_not_convert = self.get_modules_to_not_convert(
-            model, self.quantization_config.llm_int8_skip_modules,
+            model, self.quantization_config.llm_int8_skip_modules, model._keep_in_fp32_modules
         )

-
-
-
-
-        if len(keys_on_cpu) > 0 and not llm_int8_enable_fp32_cpu_offload:
-            raise ValueError(
-                "If you want to offload some keys to `cpu` or `disk`, you need to set "
-                "`llm_int8_enable_fp32_cpu_offload=True`. Note that these modules will not be "
-                " converted to 8-bit but kept in 32-bit."
-            )
-        self.modules_to_not_convert.extend(keys_on_cpu)
+        if self.quantization_config.llm_int8_enable_fp32_cpu_offload:
+            if isinstance(device_map, dict):
+                keys_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]
+                self.modules_to_not_convert.extend(keys_on_cpu)

         model = replace_with_bnb_linear(
             model,
@@ -226,21 +144,17 @@ class Bnb8BitHfQuantizer(HfQuantizer):
             pre_quantized=self.pre_quantized,
         )

-
-
-    def is_serializable(self, safe_serialization=None):
+    def is_serializable(self):
         return True

     @property
     def is_trainable(self) -> bool:
         return True

-    def _dequantize(self, model):
+    def _dequantize(self, model, dtype=None):
         from ..integrations import dequantize_and_replace

-        model = dequantize_and_replace(
-            model, self.modules_to_not_convert, quantization_config=self.quantization_config
-        )
+        model = dequantize_and_replace(model, quantization_config=self.quantization_config, dtype=dtype)
         return model

     def get_quantize_ops(self):
--- a/transformers/quantizers/quantizer_compressed_tensors.py
+++ b/transformers/quantizers/quantizer_compressed_tensors.py
@@ -31,7 +31,6 @@ class CompressedTensorsHfQuantizer(HfQuantizer):
     """

     requires_calibration = True
-    required_packages = ["compressed_tensors"]

     def __init__(self, quantization_config: CompressedTensorsConfig, **kwargs):
         super().__init__(quantization_config, **kwargs)
@@ -58,15 +57,9 @@ class CompressedTensorsHfQuantizer(HfQuantizer):
                 "Using `compressed_tensors` quantized models requires the compressed-tensors library: "
                 "`pip install compressed-tensors`"
             )
-        if not is_torch_available():
-            # torch already should be installed as part of compressed tensors
-            raise ImportError("torch is required for using compressed-tensors quantization")

     def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
-        if dtype
-            logger.info("Loading model using torch.float16 for compressed-tensors quantization")
-            dtype = torch.float16
-        elif dtype != torch.float16:
+        if dtype != torch.float16:
             logger.info("We suggest you to set `dtype=torch.float16` for better efficiency with compressed_tensors.")
         return dtype

@@ -113,6 +106,6 @@ class CompressedTensorsHfQuantizer(HfQuantizer):
         # models need to be decompressed carry out qat
         return not self.run_compressed or not self.quantization_config.is_quantization_compressed

-    def is_serializable(self
+    def is_serializable(self) -> bool:
         """Models quantized using compressed tensors can be saved to disk"""
         return True
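`update_dtype` no longer forces `torch.float16` when no dtype is given; it only logs a suggestion when a different dtype is requested, and the redundant torch availability check goes away along with the retired `required_packages` attribute. Opting into fp16 explicitly, in a hedged sketch; the checkpoint name is illustrative:

    import torch
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "nm-testing/tinyllama-w4a16-compressed",  # any compressed-tensors checkpoint
        dtype=torch.float16,
    )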
--- a/transformers/quantizers/quantizer_eetq.py
+++ b/transformers/quantizers/quantizer_eetq.py
@@ -19,7 +19,7 @@ from .base import HfQuantizer
 if TYPE_CHECKING:
     from ..modeling_utils import PreTrainedModel

-from ..utils import is_accelerate_available,
+from ..utils import is_accelerate_available, is_kernels_available, is_torch_available, logging
 from .quantizers_utils import get_module_from_name

@@ -32,40 +32,17 @@ logger = logging.get_logger(__name__)

 class EetqHfQuantizer(HfQuantizer):
     """
-    8-bit quantization from EETQ quantization method
-    before loading: converts transformer layers into W8A16Linear during loading: load 16bit weight and pass to the
-    layer object after: quantizes individual weights in Linear8bitLt into 8bit at first .cuda() call
+    8-bit quantization from EETQ quantization method
     """

-    requires_parameters_quantization = True
     requires_calibration = False

-    required_packages = ["eetq", "accelerate"]
-
     def __init__(self, quantization_config, **kwargs):
         super().__init__(quantization_config, **kwargs)
-        self.quantization_config = quantization_config

     def validate_environment(self, *args, **kwargs):
-        if not
-            raise ImportError(
-                "Using `eetq` 8-bit quantization requires eetq."
-                "Please install the latest version of eetq from : https://github.com/NetEase-FuXi/EETQ"
-            )
-
-        try:
-            import eetq  # noqa: F401
-        except ImportError as exc:
-            if "shard_checkpoint" in str(exc):
-                # EETQ 1.0.0 is currently broken with the latest transformers because it tries to import the removed
-                # shard_checkpoint function, see https://github.com/NetEase-FuXi/EETQ/issues/34.
-                # TODO: Update message once eetq releases a fix
-                raise ImportError(
-                    "You are using a version of EETQ that is incompatible with the current transformers version. "
-                    "Either downgrade transformers to <= v4.46.3 or, if available, upgrade EETQ to > v1.0.0."
-                ) from exc
-            else:
-                raise
+        if not is_kernels_available():
+            raise ImportError("Loading an EETQ quantized model requires kernels (`pip install kernels`)")

         if not is_accelerate_available():
             raise ImportError("Loading an EETQ quantized model requires accelerate (`pip install accelerate`)")
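The standalone `eetq` package, along with its workaround for the broken `shard_checkpoint` import, is no longer a dependency: the layer implementation is pulled from the kernels hub, so the environment check reduces to two probes. A sketch using the same utilities the new code imports:

    from transformers.utils import is_accelerate_available, is_kernels_available

    assert is_kernels_available(), "pip install kernels"
    assert is_accelerate_available(), "pip install accelerate"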
@@ -79,29 +56,20 @@ class EetqHfQuantizer(HfQuantizer):
                 "You have loaded an EETQ model on CPU and have a CUDA device available, make sure to set "
                 "your model on a GPU device in order to run your model."
             )
-        elif device_map
-            if
+        elif isinstance(device_map, dict):
+            if len(device_map) > 1 and "cpu" in device_map.values() or "disk" in device_map.values():
                 raise ValueError(
                     "You are attempting to load an EETQ model with a device_map that contains a CPU or disk device."
                     " This is not supported. Please remove the CPU or disk device from the device_map."
                 )

     def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
-        if dtype
-            dtype = torch.float16
-            logger.info(
-                "Overriding dtype=%s with `dtype=torch.float16` due to "
-                "requirements of `eetq` to enable model loading in 8-bit. "
-                "Pass your own dtype to specify the dtype of the remaining non-linear layers or pass"
-                " dtype=torch.float16 to remove this warning.",
-                dtype,
-            )
-        elif dtype != torch.float16:
+        if dtype != torch.float16:
             logger.info("We suggest you to set `dtype=torch.float16` for better efficiency with EETQ.")
         return dtype

     def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
-        from eetq import EetqLinear
+        from ..integrations.eetq import EetqLinear

         module, tensor_name = get_module_from_name(model, param_name)

@@ -112,55 +80,29 @@ class EetqHfQuantizer(HfQuantizer):
             return True
         return False

-    def create_quantized_param(
-        self,
-        model: "PreTrainedModel",
-        param_value: "torch.Tensor",
-        param_name: str,
-        target_device: "torch.device",
-        **kwargs,
-    ):
-        from eetq import EetqLinear, quantize_and_preprocess_weights
-
-        module, tensor_name = get_module_from_name(model, param_name)
-        new_value, weight_scale = quantize_and_preprocess_weights(param_value)
-
-        # Samity check
-        if isinstance(module, EetqLinear):
-            if self.pre_quantized or tensor_name == "bias":
-                if tensor_name == "weight" and param_value.dtype != torch.int8:
-                    raise ValueError("Expect quantized weights but got an unquantized weight")
-            else:
-                if tensor_name == "weight_scale":
-                    raise ValueError("Expect unquantized weights but got a quantized weight_scale")
-
-                module._buffers[tensor_name] = new_value.to(target_device)
-                module.register("weight_scales", weight_scale.to(target_device))
-
     def _process_model_before_weight_loading(
         self,
         model: "PreTrainedModel",
-        keep_in_fp32_modules: list[str] | None = None,
         **kwargs,
     ):
         from ..integrations import replace_with_eetq_linear

         self.modules_to_not_convert = self.get_modules_to_not_convert(
-            model, self.quantization_config.modules_to_not_convert,
+            model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
         )

         model = replace_with_eetq_linear(
-            model,
-            modules_to_not_convert=self.modules_to_not_convert,
-            quantization_config=self.quantization_config,
-            pre_quantized=self.pre_quantized,
+            model, modules_to_not_convert=self.modules_to_not_convert, pre_quantized=self.pre_quantized
         )

-
-
-    def is_serializable(self, safe_serialization=None):
+    def is_serializable(self):
         return True

     @property
     def is_trainable(self) -> bool:
         return True
+
+    def get_quantize_ops(self):
+        from ..integrations.eetq import EetqQuantize
+
+        return EetqQuantize(self)
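With `create_quantized_param` gone, on-the-fly EETQ quantization is driven by the `EetqQuantize` ops returned from `get_quantize_ops`, but the user-facing path is unchanged. A hedged usage sketch; the model name is illustrative:

    from transformers import AutoModelForCausalLM, EetqConfig

    quantization_config = EetqConfig("int8")
    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-125m",
        device_map="auto",
        quantization_config=quantization_config,
    )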