transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc1-py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- transformers/__init__.py +30 -3
- transformers/cli/serve.py +47 -17
- transformers/conversion_mapping.py +15 -2
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +196 -135
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +1 -2
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +1 -2
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/configuration_utils.py +3 -2
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/continuous_api.py +134 -79
- transformers/image_processing_base.py +1 -2
- transformers/integrations/__init__.py +4 -2
- transformers/integrations/accelerate.py +15 -3
- transformers/integrations/aqlm.py +38 -66
- transformers/integrations/awq.py +48 -514
- transformers/integrations/bitnet.py +45 -100
- transformers/integrations/bitsandbytes.py +79 -191
- transformers/integrations/deepspeed.py +1 -0
- transformers/integrations/eetq.py +84 -79
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +236 -193
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +40 -62
- transformers/integrations/hub_kernels.py +42 -3
- transformers/integrations/integration_utils.py +10 -0
- transformers/integrations/mxfp4.py +25 -65
- transformers/integrations/peft.py +7 -29
- transformers/integrations/quanto.py +73 -55
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +44 -90
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +42 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +8 -0
- transformers/modeling_rope_utils.py +30 -6
- transformers/modeling_utils.py +116 -112
- transformers/models/__init__.py +3 -0
- transformers/models/afmoe/modeling_afmoe.py +4 -4
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +2 -0
- transformers/models/altclip/modeling_altclip.py +4 -0
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/modeling_aria.py +4 -4
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/auto/configuration_auto.py +11 -0
- transformers/models/auto/feature_extraction_auto.py +2 -0
- transformers/models/auto/image_processing_auto.py +1 -0
- transformers/models/auto/modeling_auto.py +6 -0
- transformers/models/auto/processing_auto.py +18 -10
- transformers/models/auto/tokenization_auto.py +74 -472
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/bamba/modeling_bamba.py +4 -3
- transformers/models/bark/modeling_bark.py +2 -0
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/big_bird/modeling_big_bird.py +6 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +11 -2
- transformers/models/bitnet/modeling_bitnet.py +4 -4
- transformers/models/blenderbot/modeling_blenderbot.py +5 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
- transformers/models/blip/modeling_blip_text.py +2 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -1
- transformers/models/bloom/modeling_bloom.py +4 -0
- transformers/models/blt/modeling_blt.py +2 -2
- transformers/models/blt/modular_blt.py +2 -2
- transformers/models/bridgetower/modeling_bridgetower.py +5 -1
- transformers/models/bros/modeling_bros.py +4 -0
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +5 -0
- transformers/models/chameleon/modeling_chameleon.py +2 -1
- transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
- transformers/models/clap/modeling_clap.py +5 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +5 -0
- transformers/models/clvp/modeling_clvp.py +5 -0
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +4 -3
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +7 -6
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
- transformers/models/convbert/modeling_convbert.py +6 -0
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/modeling_csm.py +4 -3
- transformers/models/ctrl/modeling_ctrl.py +1 -0
- transformers/models/cvt/modeling_cvt.py +2 -0
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/modeling_d_fine.py +2 -0
- transformers/models/d_fine/modular_d_fine.py +1 -0
- transformers/models/dab_detr/modeling_dab_detr.py +4 -0
- transformers/models/dac/modeling_dac.py +2 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/dbrx/modeling_dbrx.py +2 -2
- transformers/models/deberta/modeling_deberta.py +5 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
- transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
- transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
- transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/modeling_detr.py +5 -0
- transformers/models/dia/modeling_dia.py +4 -3
- transformers/models/dia/modular_dia.py +0 -1
- transformers/models/diffllama/modeling_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +2 -3
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +2 -0
- transformers/models/dots1/modeling_dots1.py +10 -7
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/edgetam/modeling_edgetam.py +1 -1
- transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
- transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
- transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
- transformers/models/efficientnet/modeling_efficientnet.py +2 -0
- transformers/models/emu3/modeling_emu3.py +4 -4
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +14 -2
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
- transformers/models/esm/modeling_esmfold.py +5 -4
- transformers/models/evolla/modeling_evolla.py +4 -4
- transformers/models/exaone4/modeling_exaone4.py +2 -2
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +6 -1
- transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
- transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
- transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
- transformers/models/flaubert/modeling_flaubert.py +7 -0
- transformers/models/flava/modeling_flava.py +6 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
- transformers/models/florence2/modeling_florence2.py +2 -1
- transformers/models/florence2/modular_florence2.py +2 -1
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/processing_fuyu.py +3 -3
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +2 -1
- transformers/models/gemma3/modeling_gemma3.py +14 -84
- transformers/models/gemma3/modular_gemma3.py +12 -81
- transformers/models/gemma3n/modeling_gemma3n.py +18 -209
- transformers/models/gemma3n/modular_gemma3n.py +17 -59
- transformers/models/git/modeling_git.py +2 -0
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/modeling_glm4v.py +3 -3
- transformers/models/glm4v/modular_glm4v.py +6 -4
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
- transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/gpt2/modeling_gpt2.py +5 -1
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
- transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
- transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
- transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
- transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
- transformers/models/gptj/modeling_gptj.py +3 -0
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granitemoe/modeling_granitemoe.py +4 -6
- transformers/models/granitemoe/modular_granitemoe.py +0 -2
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
- transformers/models/groupvit/modeling_groupvit.py +3 -0
- transformers/models/helium/modeling_helium.py +4 -3
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +3 -0
- transformers/models/hubert/modular_hubert.py +1 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
- transformers/models/ibert/modeling_ibert.py +6 -0
- transformers/models/idefics/modeling_idefics.py +5 -21
- transformers/models/imagegpt/modeling_imagegpt.py +2 -1
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/internvl/modeling_internvl.py +2 -4
- transformers/models/internvl/modular_internvl.py +2 -4
- transformers/models/jamba/modeling_jamba.py +2 -2
- transformers/models/janus/modeling_janus.py +1 -0
- transformers/models/janus/modular_janus.py +1 -0
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/kosmos2/modeling_kosmos2.py +1 -0
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +244 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +729 -0
- transformers/models/lasr/modular_lasr.py +569 -0
- transformers/models/lasr/processing_lasr.py +96 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +5 -0
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +3 -0
- transformers/models/lfm2/modeling_lfm2.py +4 -5
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +4 -0
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/modeling_llama4.py +3 -2
- transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
- transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -0
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +4 -0
- transformers/models/mamba/modeling_mamba.py +14 -22
- transformers/models/marian/modeling_marian.py +5 -0
- transformers/models/markuplm/modeling_markuplm.py +4 -0
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/modeling_mask2former.py +2 -0
- transformers/models/maskformer/modeling_maskformer.py +2 -0
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +3 -1
- transformers/models/minimax/modeling_minimax.py +4 -4
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +4 -3
- transformers/models/mistral/modeling_mistral.py +4 -3
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mllama/modeling_mllama.py +2 -2
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/modeling_mobilevit.py +3 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
- transformers/models/modernbert/modeling_modernbert.py +4 -1
- transformers/models/modernbert/modular_modernbert.py +2 -0
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
- transformers/models/moonshine/modeling_moonshine.py +4 -2
- transformers/models/moshi/modeling_moshi.py +5 -2
- transformers/models/mpnet/modeling_mpnet.py +5 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +6 -0
- transformers/models/mt5/modeling_mt5.py +7 -0
- transformers/models/musicgen/modeling_musicgen.py +2 -0
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nemotron/modeling_nemotron.py +4 -2
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nougat/tokenization_nougat.py +11 -59
- transformers/models/nystromformer/modeling_nystromformer.py +6 -0
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +4 -5
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
- transformers/models/oneformer/modeling_oneformer.py +4 -1
- transformers/models/openai/modeling_openai.py +3 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/owlv2/modeling_owlv2.py +4 -0
- transformers/models/owlvit/modeling_owlvit.py +4 -0
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +9 -6
- transformers/models/parakeet/modular_parakeet.py +2 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
- transformers/models/patchtst/modeling_patchtst.py +20 -2
- transformers/models/pegasus/modeling_pegasus.py +5 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
- transformers/models/perceiver/modeling_perceiver.py +8 -0
- transformers/models/persimmon/modeling_persimmon.py +2 -1
- transformers/models/phi/modeling_phi.py +4 -5
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +2 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
- transformers/models/phimoe/modeling_phimoe.py +4 -4
- transformers/models/phimoe/modular_phimoe.py +2 -2
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pixtral/modeling_pixtral.py +2 -1
- transformers/models/plbart/modeling_plbart.py +6 -0
- transformers/models/plbart/modular_plbart.py +2 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/modeling_poolformer.py +2 -0
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +3 -0
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
- transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
- transformers/models/rag/modeling_rag.py +1 -0
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
- transformers/models/reformer/modeling_reformer.py +4 -0
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +6 -1
- transformers/models/rembert/modeling_rembert.py +6 -0
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +11 -2
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/modeling_rt_detr.py +2 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
- transformers/models/rwkv/modeling_rwkv.py +1 -0
- transformers/models/sam2/modeling_sam2.py +2 -2
- transformers/models/sam2/modular_sam2.py +2 -2
- transformers/models/sam2_video/modeling_sam2_video.py +1 -0
- transformers/models/sam2_video/modular_sam2_video.py +1 -0
- transformers/models/sam3/modeling_sam3.py +77 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
- transformers/models/sam3_video/modeling_sam3_video.py +1 -0
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
- transformers/models/seed_oss/modeling_seed_oss.py +2 -2
- transformers/models/segformer/modeling_segformer.py +4 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/siglip2/modeling_siglip2.py +4 -0
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
- transformers/models/speecht5/modeling_speecht5.py +13 -1
- transformers/models/splinter/modeling_splinter.py +3 -0
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +6 -0
- transformers/models/stablelm/modeling_stablelm.py +3 -1
- transformers/models/starcoder2/modeling_starcoder2.py +4 -3
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +2 -0
- transformers/models/swin/modeling_swin.py +4 -0
- transformers/models/swin2sr/modeling_swin2sr.py +2 -0
- transformers/models/swinv2/modeling_swinv2.py +4 -0
- transformers/models/t5/modeling_t5.py +7 -0
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +5 -5
- transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
- transformers/models/table_transformer/modeling_table_transformer.py +4 -0
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +2 -0
- transformers/models/timesfm/modular_timesfm.py +2 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
- transformers/models/trocr/modeling_trocr.py +2 -0
- transformers/models/tvp/modeling_tvp.py +2 -0
- transformers/models/udop/modeling_udop.py +4 -0
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/modeling_umt5.py +7 -0
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/vilt/modeling_vilt.py +6 -0
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +6 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/modeling_vitmatte.py +1 -0
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/modeling_whisper.py +6 -0
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +3 -0
- transformers/models/xglm/modeling_xglm.py +1 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +5 -0
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/yoso/modeling_yoso.py +6 -0
- transformers/models/zamba/modeling_zamba.py +2 -0
- transformers/models/zamba2/modeling_zamba2.py +4 -2
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/modeling_zoedepth.py +1 -0
- transformers/pipelines/__init__.py +2 -3
- transformers/pipelines/base.py +1 -9
- transformers/pipelines/document_question_answering.py +3 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/processing_utils.py +23 -11
- transformers/quantizers/base.py +35 -110
- transformers/quantizers/quantizer_aqlm.py +1 -5
- transformers/quantizers/quantizer_auto_round.py +1 -2
- transformers/quantizers/quantizer_awq.py +17 -81
- transformers/quantizers/quantizer_bitnet.py +3 -8
- transformers/quantizers/quantizer_bnb_4bit.py +13 -110
- transformers/quantizers/quantizer_bnb_8bit.py +16 -92
- transformers/quantizers/quantizer_compressed_tensors.py +1 -5
- transformers/quantizers/quantizer_eetq.py +14 -62
- transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
- transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
- transformers/quantizers/quantizer_fp_quant.py +48 -78
- transformers/quantizers/quantizer_gptq.py +7 -24
- transformers/quantizers/quantizer_higgs.py +40 -54
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +13 -167
- transformers/quantizers/quantizer_quanto.py +20 -64
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +1 -4
- transformers/quantizers/quantizer_torchao.py +23 -202
- transformers/quantizers/quantizer_vptq.py +8 -22
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +297 -36
- transformers/tokenization_mistral_common.py +4 -0
- transformers/tokenization_utils_base.py +113 -222
- transformers/tokenization_utils_tokenizers.py +168 -107
- transformers/trainer.py +28 -31
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +66 -28
- transformers/utils/__init__.py +3 -4
- transformers/utils/auto_docstring.py +1 -0
- transformers/utils/generic.py +27 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +61 -16
- transformers/utils/kernel_config.py +4 -2
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +75 -242
- transformers/video_processing_utils.py +1 -2
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
transformers/utils/quantization_config.py

@@ -28,16 +28,13 @@ from typing import Any, Optional, Union
 from packaging import version
 
 from ..utils import (
-    is_auto_awq_available,
     is_compressed_tensors_available,
-    is_gptqmodel_available,
     is_hqq_available,
     is_quark_available,
     is_torch_available,
     is_torchao_available,
     logging,
 )
-from .import_utils import is_auto_gptq_available
 
 
 if is_torch_available():
@@ -68,30 +65,26 @@ class QuantizationMethod(str, Enum):
     MXFP4 = "mxfp4"
 
 
-class AWQLinearVersion(str, Enum):
+class AwqFormat(str, Enum):
     GEMM = "gemm"
     GEMV = "gemv"
-    EXLLAMA = "exllama"
-    IPEX = "ipex"
+    GEMV_FAST = "gemv_fast"
 
-    @staticmethod
-    def from_str(version: str):
-        version = version.lower()
-        if version == "gemm":
-            return AWQLinearVersion.GEMM
-        elif version == "gemv":
-            return AWQLinearVersion.GEMV
-        elif version == "exllama":
-            return AWQLinearVersion.EXLLAMA
-        elif version == "ipex":
-            return AWQLinearVersion.IPEX
-        else:
-            raise ValueError(f"Unknown AWQLinearVersion {version}")
 
-
-
-
-
+class AwqBackend(str, Enum):
+    LEGACY_AWQ = "autoawq"
+    AUTO = "auto"
+    AUTO_TRAINABLE = "auto_trainable"
+    MACHETE = "machete"
+    MARLIN = "marlin"
+    EXLLAMA_V2 = "exllama_v2"
+    EXLLAMA_V1 = "exllama_v1"
+    GEMM = "gemm"
+    GEMM_TRITON = "gemm_triton"
+    GEMV = "gemv"
+    GEMV_FAST = "gemv_fast"
+    TORCH_AWQ = "torch_awq"
+    TORCH_FUSED_AWQ = "torch_fused_awq"
 
 
 @dataclass
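The one-enum-fits-all `AWQLinearVersion` (which mixed packing formats with kernels and carried its own `from_str` parser) is split into `AwqFormat` for the packed-weight layout and `AwqBackend` for kernel selection. A minimal sketch of how the new `str`-valued enums behave; the import path is assumed from the file listing above:

```python
from transformers.utils.quantization_config import AwqBackend, AwqFormat

# str-subclassing enums compare equal to their raw string values,
# so string fields from serialized configs round-trip without a parser
assert AwqFormat.GEMV_FAST == "gemv_fast"
assert AwqBackend("marlin") is AwqBackend.MARLIN

# the old `autoawq` package name survives only as a legacy alias
assert AwqBackend.LEGACY_AWQ.value == "autoawq"
```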
@@ -620,7 +613,7 @@ class ExllamaVersion(int, Enum):
 class GPTQConfig(QuantizationConfigMixin):
     """
     This is a wrapper class about all possible attributes and features that you can play with a model that has been
-    loaded using `optimum` api for gptq quantization relying on auto_gptq backend.
+    loaded using `optimum` api for GPTQ quantization relying on the gptqmodel backend.
 
     Args:
         bits (`int`):
@@ -641,22 +634,23 @@ class GPTQConfig(QuantizationConfigMixin):
         desc_act (`bool`, *optional*, defaults to `False`):
             Whether to quantize columns in order of decreasing activation size. Setting it to False can significantly
             speed up inference but the perplexity may become slightly worse. Also known as act-order.
+        act_group_aware (`bool`, *optional*, defaults to `True`):
+            Use GAR (group aware activation order) during quantization. Has measurable positive impact on quantization
+            quality. Only applicable when `desc_act = False`. Will forced to be `False` when `desc_act = True`.
         sym (`bool`, *optional*, defaults to `True`):
             Whether to use symmetric quantization.
         true_sequential (`bool`, *optional*, defaults to `True`):
             Whether to perform sequential quantization even within a single Transformer block. Instead of quantizing
             the entire block at once, we perform layer-wise quantization. As a result, each layer undergoes
             quantization using inputs that have passed through the previously quantized layers.
-        checkpoint_format (`str`, *optional*, defaults to `"gptq"`):
-            GPTQ weight format. `gptq`(v1) is supported by both gptqmodel and auto-gptq. `gptq_v2` is gptqmodel only.
+        format (`str`, *optional*, defaults to `"gptq"`):
+            GPTQ weight format. `gptq` (v1) is supported by gptqmodel. `gptq_v2` is gptqmodel only.
         meta (`dict[str, any]`, *optional*):
             Properties, such as tooling:version, that do not directly contributes to quantization or quant inference are stored in meta.
             i.e. `meta.quantizer`: ["optimum:_version_", "gptqmodel:_version_"]
         backend (`str`, *optional*):
-            Controls which gptq kernel to use. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For
-            auto-gptq, the only valid values are None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
-        use_cuda_fp16 (`bool`, *optional*, defaults to `False`):
-            Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16. Auto-gptq only.
+            Controls which kernel to use. Valid values for gptqmodel are `auto`, `auto_trainable` and more. Ref gptqmodel backends:
+            https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
         model_seqlen (`int`, *optional*):
             The maximum sequence length that the model can take.
         block_name_to_quantize (`str`, *optional*):
@@ -667,14 +661,9 @@ class GPTQConfig(QuantizationConfigMixin):
             The batch size used when processing the dataset
         pad_token_id (`int`, *optional*):
             The pad token id. Needed to prepare the dataset when `batch_size` > 1.
-        use_exllama (`bool`, *optional*):
-            Whether to use exllama backend. Defaults to `True` if unset. Only works with `bits` = 4.
         max_input_length (`int`, *optional*):
             The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input
             length. It is specific to the exllama backend with act-order.
-        exllama_config (`dict[str, Any]`, *optional*):
-            The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults
-            to `{"version": 1}` if unset.
         cache_block_outputs (`bool`, *optional*, defaults to `True`):
             Whether to cache block outputs to reuse as inputs for the succeeding block.
         modules_in_block_to_quantize (`list[list[str]]`, *optional*):
@@ -694,20 +683,18 @@ class GPTQConfig(QuantizationConfigMixin):
         group_size: int = 128,
         damp_percent: float = 0.1,
         desc_act: bool = False,
+        act_group_aware: bool = True,
         sym: bool = True,
         true_sequential: bool = True,
-        checkpoint_format: str = "gptq",
-        meta: dict[str, Any] | None = None,
-        backend: str | None = None,
-        use_cuda_fp16: bool = False,
-        model_seqlen: int | None = None,
-        block_name_to_quantize: str | None = None,
-        module_name_preceding_first_block: list[str] | None = None,
+        format: str = "gptq",
+        meta: Optional[dict[str, Any]] = None,
+        backend: Optional[str] = None,
+        model_seqlen: Optional[int] = None,
+        block_name_to_quantize: Optional[str] = None,
+        module_name_preceding_first_block: Optional[list[str]] = None,
         batch_size: int = 1,
-        pad_token_id: int | None = None,
-        use_exllama: bool | None = None,
-        max_input_length: int | None = None,
-        exllama_config: dict[str, Any] | None = None,
+        pad_token_id: Optional[int] = None,
+        max_input_length: Optional[int] = None,
         cache_block_outputs: bool = True,
         modules_in_block_to_quantize: list[list[str]] | None = None,
         **kwargs,
@@ -719,33 +706,28 @@ class GPTQConfig(QuantizationConfigMixin):
         self.group_size = group_size
         self.damp_percent = damp_percent
         self.desc_act = desc_act
+        self.act_group_aware = act_group_aware
         self.sym = sym
         self.true_sequential = true_sequential
-        self.checkpoint_format = checkpoint_format.lower()
+        self.format = format.lower()
+        # Compatible with legacy field: checkpoint_format
+        if kwargs.get("checkpoint_format") is not None:
+            self.format = kwargs.pop("checkpoint_format").lower()
         self.meta = meta
         self.backend = backend.lower() if isinstance(backend, str) else backend
-        self.use_cuda_fp16 = use_cuda_fp16
         self.model_seqlen = model_seqlen
         self.block_name_to_quantize = block_name_to_quantize
         self.module_name_preceding_first_block = module_name_preceding_first_block
         self.batch_size = batch_size
         self.pad_token_id = pad_token_id
-        self.use_exllama = use_exllama
         self.max_input_length = max_input_length
-        self.exllama_config = exllama_config
         self.cache_block_outputs = cache_block_outputs
         self.modules_in_block_to_quantize = modules_in_block_to_quantize
         self.post_init()
 
     def get_loading_attributes(self):
         attributes_dict = copy.deepcopy(self.__dict__)
-        loading_attributes = [
-            "use_exllama",
-            "exllama_config",
-            "use_cuda_fp16",
-            "max_input_length",
-            "backend",
-        ]
+        loading_attributes = ["max_input_length", "backend"]
         loading_attributes_dict = {i: j for i, j in attributes_dict.items() if i in loading_attributes}
         return loading_attributes_dict
 
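Net effect of the constructor changes: the stored attribute is now `self.format`, and a legacy `checkpoint_format` keyword arriving through `**kwargs` overrides it. A hedged sketch of the expected behavior (keywords taken from the diff, not verified against the released wheel):

```python
from transformers import GPTQConfig

# new-style field
cfg = GPTQConfig(bits=4, format="gptq_v2")
assert cfg.format == "gptq_v2"

# legacy-style field comes in via **kwargs, wins, and is lower-cased
legacy = GPTQConfig(bits=4, checkpoint_format="GPTQ_V2")
assert legacy.format == "gptq_v2"
```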
@@ -772,46 +754,14 @@ class GPTQConfig(QuantizationConfigMixin):
                     ['wikitext2','c4','c4-new'], but we found {self.dataset}"""
                 )
 
-        # make sure backend is back/forward compatible with both gptqmodel (full) and auto-gptq (partial func)
-        if is_gptqmodel_available():
-            # convert auto-gptq control into gptqmodel backend
-            if self.backend is None:
-                self.backend = "auto_trainable" if self.use_exllama is not None and not self.use_exllama else "auto"
-        else:
-            # convert gptqmodel backend `auto_trainable` into auto-gptq control
-            if self.backend == "auto_trainable":
-                self.use_exllama = False
-
-        # auto-gptq specific kernel control logic
-        if self.use_exllama is None:
-            # New default behaviour
-            self.use_exllama = True
+        # act_group_order is only applicable when `desc_act = False`
+        if self.desc_act and self.act_group_aware:
+            self.act_group_aware = False
+            logger.warning("`act_group_aware` has been auto-disabled as it is not compatible with `desc_act = True`.")
 
-        if self.exllama_config is None:
-            self.exllama_config = {"version": ExllamaVersion.ONE}
-        else:
-            if "version" not in self.exllama_config:
-                raise ValueError("`exllama_config` needs to have a `version` key.")
-            elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
-                exllama_version = self.exllama_config["version"]
-                raise ValueError(
-                    f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {exllama_version}"
-                )
-
-        if self.bits == 4 and self.use_exllama:
-            if self.exllama_config["version"] == ExllamaVersion.ONE:
-                logger.info(
-                    "You have activated exllama backend. Note that you can get better inference "
-                    "speed using exllamav2 kernel by setting `exllama_config`."
-                )
-            elif self.exllama_config["version"] == ExllamaVersion.TWO:
-                if is_auto_gptq_available():
-                    optimum_version = version.parse(importlib.metadata.version("optimum"))
-                    autogptq_version = version.parse(importlib.metadata.version("auto_gptq"))
-                    if optimum_version <= version.parse("1.13.2") or autogptq_version <= version.parse("0.4.2"):
-                        raise ValueError(
-                            f"You need optimum > 1.13.2 and auto-gptq > 0.4.2 . Make sure to have that version installed - detected version : optimum {optimum_version} and autogptq {autogptq_version}"
-                        )
+        # make sure backend default stays consistent with gptqmodel expectations
+        if self.backend is None:
+            self.backend = "auto"
         if self.modules_in_block_to_quantize is not None:
             optimum_version = version.parse(importlib.metadata.version("optimum"))
             if optimum_version < version.parse("1.15.0"):
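The deleted block was the auto-gptq/exllama kernel negotiation; what remains in `post_init` is two small invariants, sketched below per the added lines (illustrative only):

```python
from transformers import GPTQConfig

# invariant 1: act_group_aware is auto-disabled (with a warning) under desc_act=True
cfg = GPTQConfig(bits=4, desc_act=True, act_group_aware=True)
assert cfg.act_group_aware is False

# invariant 2: an unset backend now defaults to gptqmodel's "auto"
assert GPTQConfig(bits=4).backend == "auto"
```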
@@ -821,17 +771,15 @@ class GPTQConfig(QuantizationConfigMixin):
 
     def to_dict(self) -> dict[str, Any]:
         config_dict = super().to_dict()
-        config_dict.pop("disable_exllama", None)
+        # Compatible with legacy field: checkpoint_format
+        config_dict["checkpoint_format"] = self.format
         return config_dict
 
     def to_dict_optimum(self):
         """
         Get compatible dict for optimum gptq config
         """
-        quant_dict = self.to_dict()
-        # make it compatible with optimum config
-        quant_dict["disable_exllama"] = not self.use_exllama
-        return quant_dict
+        return self.to_dict()
 
     @classmethod
     def from_dict_optimum(cls, config_dict):
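Serialization keeps a bridge for older consumers: `to_dict` mirrors the value under the legacy `checkpoint_format` key, and `to_dict_optimum` becomes a plain alias since there is no `disable_exllama` left to rewrite. A sketch, assuming the methods behave exactly as shown in the diff:

```python
from transformers import GPTQConfig

cfg = GPTQConfig(bits=4, format="gptq_v2")
d = cfg.to_dict()
assert d["checkpoint_format"] == "gptq_v2"  # legacy key mirrored from `format`
assert cfg.to_dict_optimum() == d           # now just an alias for to_dict()
```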
@@ -839,17 +787,12 @@ class GPTQConfig(QuantizationConfigMixin):
         Get compatible class with optimum gptq config dict
         """
 
-        if "disable_exllama" in config_dict:
-            config_dict["use_exllama"] = not config_dict["disable_exllama"]
-            # switch to None to not trigger the warning
-            config_dict.pop("disable_exllama")
-
         config = cls(**config_dict)
         return config
 
 
 @dataclass
-class AwqConfig(QuantizationConfigMixin):
+class AwqConfig(GPTQConfig):
     """
     This is a wrapper class about all possible attributes and features that you can play with a model that has been
     loaded using `auto-awq` library awq quantization relying on auto_awq backend.
@@ -861,26 +804,12 @@ class AwqConfig(QuantizationConfigMixin):
             The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
         zero_point (`bool`, *optional*, defaults to `True`):
             Whether to use zero point quantization.
-        version (`AWQLinearVersion`, *optional*, defaults to `AWQLinearVersion.GEMM`):
-            The version of the quantization algorithm to use. GEMM is better for big batch_size (e.g. >= 8) otherwise,
-            GEMV is better (e.g. < 8 ). GEMM models are compatible with Exllama kernels.
-        backend (`AwqBackendPackingMethod`, *optional*, defaults to `AwqBackendPackingMethod.AUTOAWQ`):
-            The quantization backend. Some models might be quantized using `llm-awq` backend. This is useful for users
-            that quantize their own models using `llm-awq` library.
-        do_fuse (`bool`, *optional*, defaults to `False`):
-            Whether to fuse attention and mlp layers together for faster inference
-        fuse_max_seq_len (`int`, *optional*):
-            The Maximum sequence length to generate when using fusing.
-        modules_to_fuse (`dict`, *optional*, default to `None`):
-            Overwrite the natively supported fusing scheme with the one specified by the users.
+        backend (`AwqBackend`, *optional*, defaults to `AwqBackend.AUTO`):
+            The quantization backend.
         modules_to_not_convert (`list`, *optional*, default to `None`):
             The list of modules to not quantize, useful for quantizing models that explicitly require to have
             some modules left in their original precision (e.g. Whisper encoder, Llava encoder, Mixtral gate layers).
             Note you cannot quantize directly with transformers, please refer to `AutoAWQ` documentation for quantizing HF models.
-        exllama_config (`dict[str, Any]`, *optional*):
-            You can specify the version of the exllama kernel through the `version` key, the maximum sequence
-            length through the `max_input_len` key, and the maximum batch size through the `max_batch_size` key.
-            Defaults to `{"version": 2, "max_input_len": 2048, "max_batch_size": 8}` if unset.
     """
 
     def __init__(
@@ -888,141 +817,45 @@ class AwqConfig(QuantizationConfigMixin):
         bits: int = 4,
         group_size: int = 128,
         zero_point: bool = True,
-        version: AWQLinearVersion = AWQLinearVersion.GEMM,
-        backend: AwqBackendPackingMethod = AwqBackendPackingMethod.AUTOAWQ,
-        do_fuse: bool | None = None,
-        fuse_max_seq_len: int | None = None,
-        modules_to_fuse: dict | None = None,
+        backend: AwqBackend = AwqBackend.AUTO,
         modules_to_not_convert: list | None = None,
-        exllama_config: dict[str, int] | None = None,
         **kwargs,
     ):
-        self.quant_method = QuantizationMethod.AWQ
-
-        self.bits = bits
-        self.group_size = group_size
+        format = kwargs.pop("format", AwqFormat.GEMM)
+        # Compatible with legacy field: version
+        if kwargs.get("version") is not None:
+            format = kwargs.pop("version").lower()
+        # Compatible with legacy backend
+        if backend == AwqBackend.LEGACY_AWQ:
+            backend = AwqBackend.AUTO
         self.zero_point = zero_point
-        self.version = version
-        self.backend = backend
-        self.fuse_max_seq_len = fuse_max_seq_len
         self.modules_to_not_convert = modules_to_not_convert
-        self.exllama_config = exllama_config
-
-        self.modules_to_fuse = modules_to_fuse
-        if do_fuse is None:
-            self.do_fuse = modules_to_fuse is not None and len(modules_to_fuse) > 0
-        else:
-            self.do_fuse = do_fuse
-        self.fuse_max_seq_len = fuse_max_seq_len
 
-        self.post_init()
+        super().__init__(bits=bits, group_size=group_size, backend=backend, format=format, **kwargs)
+        self.quant_method = QuantizationMethod.AWQ
 
     def post_init(self):
         r"""
         Safety checker that arguments are correct
         """
-        if self.backend not in [AwqBackendPackingMethod.AUTOAWQ, AwqBackendPackingMethod.LLMAWQ]:
-            raise ValueError(
-                f"Only supported quantization backends in {AwqBackendPackingMethod.AUTOAWQ} and {AwqBackendPackingMethod.LLMAWQ} - not recognized backend {self.backend}"
-            )
-
-        self.version = AWQLinearVersion.from_str(self.version)
-        if self.version not in [
-            AWQLinearVersion.GEMM,
-            AWQLinearVersion.GEMV,
-            AWQLinearVersion.EXLLAMA,
-            AWQLinearVersion.IPEX,
+        if self.format not in [
+            AwqFormat.GEMM,
+            AwqFormat.GEMV,
+            AwqFormat.GEMV_FAST,
         ]:
             raise ValueError(
-                f"Only supported versions are in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV, AWQLinearVersion.EXLLAMA, AWQLinearVersion.IPEX] - not recognized version {self.version}"
+                f"Only supported versions are in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV, AWQLinearVersion.GEMV_FAST] - not recognized version {self.format}"
             )
 
-        if self.backend == AwqBackendPackingMethod.LLMAWQ:
-            # Only cuda and xpu devices can run this function
-            if not (torch.cuda.is_available() or torch.xpu.is_available()):
-                raise ValueError("LLM-AWQ backend is only supported on CUDA and XPU")
-            if torch.cuda.is_available():
-                compute_capability = torch.cuda.get_device_capability()
-                major, minor = compute_capability
-                if major < 8:
-                    raise ValueError("LLM-AWQ backend is only supported on CUDA GPUs with compute capability >= 8.0")
-
-        if self.do_fuse and self.fuse_max_seq_len is None:
-            raise ValueError(
-                "You cannot enable fused modules without specifying a `fuse_max_seq_len`, make sure to pass a valid `fuse_max_seq_len` for your usecase"
-            )
+        if self.backend not in AwqBackend.__members__.values():
+            raise ValueError(f"Invalid backend '{self.backend}'. Must be one of: {[b.value for b in AwqBackend]}")
 
-        if self.do_fuse:
-            awq_version_supports_fusing = False
-            MIN_AWQ_VERSION = "0.1.7"
-            if is_auto_awq_available():
-                awq_version_supports_fusing = version.parse(importlib.metadata.version("autoawq")) >= version.parse(
-                    MIN_AWQ_VERSION
-                )
-
-            if not awq_version_supports_fusing:
-                raise ValueError(
-                    f"You current version of `autoawq` does not support module fusing, please upgrade `autoawq` package to at least {MIN_AWQ_VERSION}."
-                )
-
-        if self.modules_to_not_convert is not None:
-            awq_version_supports_non_conversion = False
-            MIN_AWQ_VERSION = "0.1.8"
-            if is_auto_awq_available():
-                awq_version_supports_non_conversion = version.parse(
-                    importlib.metadata.version("autoawq")
-                ) >= version.parse(MIN_AWQ_VERSION)
-
-            if not awq_version_supports_non_conversion:
-                raise ValueError(
-                    f"You current version of `autoawq` does not support module quantization skipping, please upgrade `autoawq` package to at least {MIN_AWQ_VERSION}."
-                )
-
-        if self.do_fuse and self.modules_to_fuse is not None:
-            required_keys = [
-                "hidden_size",
-                "num_attention_heads",
-                "num_key_value_heads",
-                "mlp",
-                "attention",
-                "layernorm",
-                "use_alibi",
-            ]
-            if not all(key in self.modules_to_fuse for key in required_keys):
-                raise ValueError(
-                    f"Required fields are missing in the fusing mapping, required fields are {required_keys}"
-                )
-
-        if self.version == AWQLinearVersion.EXLLAMA:
-            awq_version_supports_exllama = False
-            MIN_AWQ_VERSION = "0.2.0"
-            if is_auto_awq_available():
-                awq_version_supports_exllama = version.parse(importlib.metadata.version("autoawq")) >= version.parse(
-                    MIN_AWQ_VERSION
-                )
-
-            if not awq_version_supports_exllama:
-                raise ValueError(
-                    f"You current version of `autoawq` does not support exllama backend, "
-                    f"please upgrade `autoawq` package to at least {MIN_AWQ_VERSION}."
-                )
-
-            if self.exllama_config is None:
-                self.exllama_config = {"version": ExllamaVersion.TWO, "max_input_len": 2048, "max_batch_size": 8}
-            else:
-                if "version" not in self.exllama_config:
-                    raise ValueError("`exllama_config` needs to have a `version` key.")
-                elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
-                    exllama_version = self.exllama_config["version"]
-                    raise ValueError(
-                        f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {exllama_version}"
-                    )
-
-    def get_loading_attributes(self):
-        attributes_dict = copy.deepcopy(self.__dict__)
-        loading_attributes = ["version", "do_fuse", "modules_to_fuse", "fuse_max_seq_len", "exllama_config"]
-        loading_attributes_dict = {i: j for i, j in attributes_dict.items() if i in loading_attributes}
-        return loading_attributes_dict
+    def to_dict(self) -> dict[str, Any]:
+        config_dict = super().to_dict()
+        config_dict.pop("checkpoint_format")
+        # Compatible with legacy field: version
+        config_dict["version"] = self.format
+        return config_dict
 
 
 @dataclass
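`AwqConfig` now inherits the `GPTQConfig` plumbing and keeps two compatibility shims of its own: a legacy `version` kwarg is remapped to `format`, and the legacy `"autoawq"` backend collapses to `"auto"`. A hedged sketch (field names from the diff, untested against the wheel):

```python
from transformers import AwqConfig

legacy = AwqConfig(bits=4, version="GEMV")          # old field name, mixed case
assert legacy.format == "gemv"                      # remapped and lower-cased
assert legacy.to_dict()["version"] == "gemv"        # re-serialized under the legacy key
assert "checkpoint_format" not in legacy.to_dict()  # GPTQ-only legacy key is stripped
```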
transformers/feature_extraction_utils.py

@@ -22,7 +22,7 @@ from functools import partial
 from typing import Any, Optional, Union
 
 import numpy as np
-from huggingface_hub import create_repo
+from huggingface_hub import create_repo, is_offline_mode
 from huggingface_hub.dataclasses import validate_typed_dict
 
 from .dynamic_module_utils import custom_object_save
@@ -44,7 +44,6 @@ from .utils import (
     TensorType,
     add_start_docstrings,
     copy_func,
-    is_offline_mode,
     is_torch_available,
     is_torchcodec_available,
     is_torchvision_v2_available,
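The practical takeaway for this module: `is_offline_mode` is now imported from `huggingface_hub` directly rather than re-exported through `transformers.utils`. A sketch of the new-style import:

```python
# new canonical import, per this diff
from huggingface_hub import is_offline_mode

if is_offline_mode():
    # e.g. HF_HUB_OFFLINE=1 is set: avoid network lookups, rely on local cache
    print("Running offline; using only local caches.")
```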