transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +30 -3
- transformers/cli/serve.py +47 -17
- transformers/conversion_mapping.py +15 -2
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +196 -135
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +1 -2
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +1 -2
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/configuration_utils.py +3 -2
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/continuous_api.py +134 -79
- transformers/image_processing_base.py +1 -2
- transformers/integrations/__init__.py +4 -2
- transformers/integrations/accelerate.py +15 -3
- transformers/integrations/aqlm.py +38 -66
- transformers/integrations/awq.py +48 -514
- transformers/integrations/bitnet.py +45 -100
- transformers/integrations/bitsandbytes.py +79 -191
- transformers/integrations/deepspeed.py +1 -0
- transformers/integrations/eetq.py +84 -79
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +236 -193
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +40 -62
- transformers/integrations/hub_kernels.py +42 -3
- transformers/integrations/integration_utils.py +10 -0
- transformers/integrations/mxfp4.py +25 -65
- transformers/integrations/peft.py +7 -29
- transformers/integrations/quanto.py +73 -55
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +44 -90
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +42 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +8 -0
- transformers/modeling_rope_utils.py +30 -6
- transformers/modeling_utils.py +116 -112
- transformers/models/__init__.py +3 -0
- transformers/models/afmoe/modeling_afmoe.py +4 -4
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +2 -0
- transformers/models/altclip/modeling_altclip.py +4 -0
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/modeling_aria.py +4 -4
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/auto/configuration_auto.py +11 -0
- transformers/models/auto/feature_extraction_auto.py +2 -0
- transformers/models/auto/image_processing_auto.py +1 -0
- transformers/models/auto/modeling_auto.py +6 -0
- transformers/models/auto/processing_auto.py +18 -10
- transformers/models/auto/tokenization_auto.py +74 -472
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/bamba/modeling_bamba.py +4 -3
- transformers/models/bark/modeling_bark.py +2 -0
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/big_bird/modeling_big_bird.py +6 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +11 -2
- transformers/models/bitnet/modeling_bitnet.py +4 -4
- transformers/models/blenderbot/modeling_blenderbot.py +5 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
- transformers/models/blip/modeling_blip_text.py +2 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -1
- transformers/models/bloom/modeling_bloom.py +4 -0
- transformers/models/blt/modeling_blt.py +2 -2
- transformers/models/blt/modular_blt.py +2 -2
- transformers/models/bridgetower/modeling_bridgetower.py +5 -1
- transformers/models/bros/modeling_bros.py +4 -0
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +5 -0
- transformers/models/chameleon/modeling_chameleon.py +2 -1
- transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
- transformers/models/clap/modeling_clap.py +5 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +5 -0
- transformers/models/clvp/modeling_clvp.py +5 -0
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +4 -3
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +7 -6
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
- transformers/models/convbert/modeling_convbert.py +6 -0
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/modeling_csm.py +4 -3
- transformers/models/ctrl/modeling_ctrl.py +1 -0
- transformers/models/cvt/modeling_cvt.py +2 -0
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/modeling_d_fine.py +2 -0
- transformers/models/d_fine/modular_d_fine.py +1 -0
- transformers/models/dab_detr/modeling_dab_detr.py +4 -0
- transformers/models/dac/modeling_dac.py +2 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/dbrx/modeling_dbrx.py +2 -2
- transformers/models/deberta/modeling_deberta.py +5 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
- transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
- transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
- transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/modeling_detr.py +5 -0
- transformers/models/dia/modeling_dia.py +4 -3
- transformers/models/dia/modular_dia.py +0 -1
- transformers/models/diffllama/modeling_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +2 -3
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +2 -0
- transformers/models/dots1/modeling_dots1.py +10 -7
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/edgetam/modeling_edgetam.py +1 -1
- transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
- transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
- transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
- transformers/models/efficientnet/modeling_efficientnet.py +2 -0
- transformers/models/emu3/modeling_emu3.py +4 -4
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +14 -2
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
- transformers/models/esm/modeling_esmfold.py +5 -4
- transformers/models/evolla/modeling_evolla.py +4 -4
- transformers/models/exaone4/modeling_exaone4.py +2 -2
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +6 -1
- transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
- transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
- transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
- transformers/models/flaubert/modeling_flaubert.py +7 -0
- transformers/models/flava/modeling_flava.py +6 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
- transformers/models/florence2/modeling_florence2.py +2 -1
- transformers/models/florence2/modular_florence2.py +2 -1
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/processing_fuyu.py +3 -3
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +2 -1
- transformers/models/gemma3/modeling_gemma3.py +14 -84
- transformers/models/gemma3/modular_gemma3.py +12 -81
- transformers/models/gemma3n/modeling_gemma3n.py +18 -209
- transformers/models/gemma3n/modular_gemma3n.py +17 -59
- transformers/models/git/modeling_git.py +2 -0
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/modeling_glm4v.py +3 -3
- transformers/models/glm4v/modular_glm4v.py +6 -4
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
- transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/gpt2/modeling_gpt2.py +5 -1
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
- transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
- transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
- transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
- transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
- transformers/models/gptj/modeling_gptj.py +3 -0
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granitemoe/modeling_granitemoe.py +4 -6
- transformers/models/granitemoe/modular_granitemoe.py +0 -2
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
- transformers/models/groupvit/modeling_groupvit.py +3 -0
- transformers/models/helium/modeling_helium.py +4 -3
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +3 -0
- transformers/models/hubert/modular_hubert.py +1 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
- transformers/models/ibert/modeling_ibert.py +6 -0
- transformers/models/idefics/modeling_idefics.py +5 -21
- transformers/models/imagegpt/modeling_imagegpt.py +2 -1
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/internvl/modeling_internvl.py +2 -4
- transformers/models/internvl/modular_internvl.py +2 -4
- transformers/models/jamba/modeling_jamba.py +2 -2
- transformers/models/janus/modeling_janus.py +1 -0
- transformers/models/janus/modular_janus.py +1 -0
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/kosmos2/modeling_kosmos2.py +1 -0
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +244 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +729 -0
- transformers/models/lasr/modular_lasr.py +569 -0
- transformers/models/lasr/processing_lasr.py +96 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +5 -0
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +3 -0
- transformers/models/lfm2/modeling_lfm2.py +4 -5
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +4 -0
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/modeling_llama4.py +3 -2
- transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
- transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -0
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +4 -0
- transformers/models/mamba/modeling_mamba.py +14 -22
- transformers/models/marian/modeling_marian.py +5 -0
- transformers/models/markuplm/modeling_markuplm.py +4 -0
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/modeling_mask2former.py +2 -0
- transformers/models/maskformer/modeling_maskformer.py +2 -0
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +3 -1
- transformers/models/minimax/modeling_minimax.py +4 -4
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +4 -3
- transformers/models/mistral/modeling_mistral.py +4 -3
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mllama/modeling_mllama.py +2 -2
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/modeling_mobilevit.py +3 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
- transformers/models/modernbert/modeling_modernbert.py +4 -1
- transformers/models/modernbert/modular_modernbert.py +2 -0
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
- transformers/models/moonshine/modeling_moonshine.py +4 -2
- transformers/models/moshi/modeling_moshi.py +5 -2
- transformers/models/mpnet/modeling_mpnet.py +5 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +6 -0
- transformers/models/mt5/modeling_mt5.py +7 -0
- transformers/models/musicgen/modeling_musicgen.py +2 -0
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nemotron/modeling_nemotron.py +4 -2
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nougat/tokenization_nougat.py +11 -59
- transformers/models/nystromformer/modeling_nystromformer.py +6 -0
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +4 -5
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
- transformers/models/oneformer/modeling_oneformer.py +4 -1
- transformers/models/openai/modeling_openai.py +3 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/owlv2/modeling_owlv2.py +4 -0
- transformers/models/owlvit/modeling_owlvit.py +4 -0
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +9 -6
- transformers/models/parakeet/modular_parakeet.py +2 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
- transformers/models/patchtst/modeling_patchtst.py +20 -2
- transformers/models/pegasus/modeling_pegasus.py +5 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
- transformers/models/perceiver/modeling_perceiver.py +8 -0
- transformers/models/persimmon/modeling_persimmon.py +2 -1
- transformers/models/phi/modeling_phi.py +4 -5
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +2 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
- transformers/models/phimoe/modeling_phimoe.py +4 -4
- transformers/models/phimoe/modular_phimoe.py +2 -2
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pixtral/modeling_pixtral.py +2 -1
- transformers/models/plbart/modeling_plbart.py +6 -0
- transformers/models/plbart/modular_plbart.py +2 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/modeling_poolformer.py +2 -0
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +3 -0
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
- transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
- transformers/models/rag/modeling_rag.py +1 -0
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
- transformers/models/reformer/modeling_reformer.py +4 -0
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +6 -1
- transformers/models/rembert/modeling_rembert.py +6 -0
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +11 -2
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/modeling_rt_detr.py +2 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
- transformers/models/rwkv/modeling_rwkv.py +1 -0
- transformers/models/sam2/modeling_sam2.py +2 -2
- transformers/models/sam2/modular_sam2.py +2 -2
- transformers/models/sam2_video/modeling_sam2_video.py +1 -0
- transformers/models/sam2_video/modular_sam2_video.py +1 -0
- transformers/models/sam3/modeling_sam3.py +77 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
- transformers/models/sam3_video/modeling_sam3_video.py +1 -0
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
- transformers/models/seed_oss/modeling_seed_oss.py +2 -2
- transformers/models/segformer/modeling_segformer.py +4 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/siglip2/modeling_siglip2.py +4 -0
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
- transformers/models/speecht5/modeling_speecht5.py +13 -1
- transformers/models/splinter/modeling_splinter.py +3 -0
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +6 -0
- transformers/models/stablelm/modeling_stablelm.py +3 -1
- transformers/models/starcoder2/modeling_starcoder2.py +4 -3
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +2 -0
- transformers/models/swin/modeling_swin.py +4 -0
- transformers/models/swin2sr/modeling_swin2sr.py +2 -0
- transformers/models/swinv2/modeling_swinv2.py +4 -0
- transformers/models/t5/modeling_t5.py +7 -0
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +5 -5
- transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
- transformers/models/table_transformer/modeling_table_transformer.py +4 -0
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +2 -0
- transformers/models/timesfm/modular_timesfm.py +2 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
- transformers/models/trocr/modeling_trocr.py +2 -0
- transformers/models/tvp/modeling_tvp.py +2 -0
- transformers/models/udop/modeling_udop.py +4 -0
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/modeling_umt5.py +7 -0
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/vilt/modeling_vilt.py +6 -0
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +6 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/modeling_vitmatte.py +1 -0
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/modeling_whisper.py +6 -0
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +3 -0
- transformers/models/xglm/modeling_xglm.py +1 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +5 -0
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/yoso/modeling_yoso.py +6 -0
- transformers/models/zamba/modeling_zamba.py +2 -0
- transformers/models/zamba2/modeling_zamba2.py +4 -2
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/modeling_zoedepth.py +1 -0
- transformers/pipelines/__init__.py +2 -3
- transformers/pipelines/base.py +1 -9
- transformers/pipelines/document_question_answering.py +3 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/processing_utils.py +23 -11
- transformers/quantizers/base.py +35 -110
- transformers/quantizers/quantizer_aqlm.py +1 -5
- transformers/quantizers/quantizer_auto_round.py +1 -2
- transformers/quantizers/quantizer_awq.py +17 -81
- transformers/quantizers/quantizer_bitnet.py +3 -8
- transformers/quantizers/quantizer_bnb_4bit.py +13 -110
- transformers/quantizers/quantizer_bnb_8bit.py +16 -92
- transformers/quantizers/quantizer_compressed_tensors.py +1 -5
- transformers/quantizers/quantizer_eetq.py +14 -62
- transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
- transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
- transformers/quantizers/quantizer_fp_quant.py +48 -78
- transformers/quantizers/quantizer_gptq.py +7 -24
- transformers/quantizers/quantizer_higgs.py +40 -54
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +13 -167
- transformers/quantizers/quantizer_quanto.py +20 -64
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +1 -4
- transformers/quantizers/quantizer_torchao.py +23 -202
- transformers/quantizers/quantizer_vptq.py +8 -22
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +297 -36
- transformers/tokenization_mistral_common.py +4 -0
- transformers/tokenization_utils_base.py +113 -222
- transformers/tokenization_utils_tokenizers.py +168 -107
- transformers/trainer.py +28 -31
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +66 -28
- transformers/utils/__init__.py +3 -4
- transformers/utils/auto_docstring.py +1 -0
- transformers/utils/generic.py +27 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +61 -16
- transformers/utils/kernel_config.py +4 -2
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +75 -242
- transformers/video_processing_utils.py +1 -2
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0

transformers/quantizers/quantizer_finegrained_fp8.py

@@ -20,26 +20,20 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
     Supports both e4m3fn formats based on platform.
     """

-    requires_parameters_quantization = True
     requires_calibration = False
-    required_packages = ["accelerate"]

     def __init__(self, quantization_config, **kwargs):
         super().__init__(quantization_config, **kwargs)
-        self.quantization_config = quantization_config

     def validate_environment(self, *args, **kwargs):
-        if not is_torch_available():
-            raise ImportError(
-                "Using fp8 quantization requires torch >= 2.1.0"
-                "Please install the latest version of torch ( pip install --upgrade torch )"
-            )
-
         if not is_accelerate_available():
             raise ImportError("Loading an FP8 quantized model requires accelerate (`pip install accelerate`)")

-        if
-
+        if self.quantization_config.dequantize:
+            return
+
+        if not torch.cuda.is_available() and not is_torch_xpu_available():
+            if self.pre_quantized and not self.quantization_config.dequantize:
                 logger.warning_once(
                     "Using FP8 quantized models requires a GPU or XPU, we will default to dequantizing the model to bf16 since no GPU or XPU is available"
                 )
@@ -64,11 +58,12 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
                 "your model on a GPU or XPU device in order to run your model. To remove this warning, "
                 "pass device_map = 'cuda' or 'xpu'. "
             )
-        elif device_map
+        elif isinstance(device_map, dict):
             if (
                 not self.pre_quantized
-                and
-                and
+                and len(device_map) > 1
+                and "cpu" in device_map.values()
+                or "disk" in device_map.values()
             ):
                 raise ValueError(
                     "You are attempting to load an FP8 model with a device_map that contains a cpu/disk device."
@@ -76,76 +71,6 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
                     "Please use a quantized checkpoint or remove the cpu/disk device from the device_map."
                 )

-    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
-        if dtype is None:
-            logger.info("Setting dtype to torch.float32 as no dtype was specified in from_pretrained")
-            dtype = torch.float32
-        return dtype
-
-    # TODO: make this into a `ConversionType` ops -> potentially requires all weights on all ranks
-    # depending on the layer type (moe -> no if ep)
-    def create_quantized_param(
-        self,
-        model: "PreTrainedModel",
-        param_value: "torch.Tensor",
-        param_name: str,
-        target_device: "torch.device",
-        **kwargs,
-    ):
-        from ..integrations.finegrained_fp8 import FP8Linear
-        from ..modeling_utils import _load_parameter_into_model
-
-        # Sanity checks
-        module, tensor_name = get_module_from_name(model, param_name)
-        if isinstance(module, FP8Linear):
-            if self.pre_quantized or tensor_name == "bias":
-                if tensor_name == "weight" and param_value.dtype != torch.float8_e4m3fn:
-                    raise ValueError("Expect quantized weights but got an unquantized weight")
-                else:
-                    return
-            # if tensor_name == "weight_scale_inv":
-            #     raise ValueError("Expect unquantized weights but got a quantized weight_scale")
-
-        param_value = param_value.to(target_device)
-
-        # Get FP8 min/max values
-        fp8_min = torch.finfo(torch.float8_e4m3fn).min
-        fp8_max = torch.finfo(torch.float8_e4m3fn).max
-
-        block_size_m, block_size_n = self.quantization_config.weight_block_size
-
-        rows, cols = param_value.shape[-2:]
-
-        if rows % block_size_m != 0 or cols % block_size_n != 0:
-            raise ValueError(
-                f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_size_m}, {block_size_n})"
-            )
-        param_value_orig_shape = param_value.shape
-
-        param_value = param_value.reshape(
-            -1, rows // block_size_m, block_size_m, cols // block_size_n, block_size_n
-        ).permute(0, 1, 3, 2, 4)
-
-        # Calculate scaling factor for each block
-        max_abs = torch.amax(torch.abs(param_value), dim=(-1, -2))
-        scale = fp8_max / max_abs
-        scale_orig_shape = scale.shape
-        scale = scale.unsqueeze(-1).unsqueeze(-1)
-
-        # Quantize the weights
-        quantized_param = torch.clamp(param_value * scale, min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
-
-        quantized_param = quantized_param.permute(0, 1, 3, 2, 4)
-        # Reshape back to matrix shape
-        quantized_param = quantized_param.reshape(param_value_orig_shape)
-
-        # Reshape scale to match the number of blocks
-        scale = scale.reshape(scale_orig_shape).squeeze().reciprocal()
-
-        # Load into the model
-        _load_parameter_into_model(model, param_name, quantized_param)
-        _load_parameter_into_model(model, param_name.rsplit(".", 1)[0] + ".weight_scale_inv", scale)
-
     def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
         from ..integrations.finegrained_fp8 import FP8Expert, FP8Linear

@@ -165,35 +90,17 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
     ):
         from ..integrations.finegrained_fp8 import replace_with_fp8_linear

-        # takes 2 fucking seconds
         self.modules_to_not_convert = self.get_modules_to_not_convert(
             model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
         )

-        # while this one is 81ms :)
         model = replace_with_fp8_linear(
             model,
             modules_to_not_convert=self.modules_to_not_convert,
             quantization_config=self.quantization_config,
+            pre_quantized=self.pre_quantized,
         )

-        model.config.quantization_config = self.quantization_config
-
-    def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]:
-        from ..integrations import FP8Linear
-
-        not_missing_keys = []
-        for name, module in model.named_modules():
-            if isinstance(module, FP8Linear):
-                for missing in missing_keys:
-                    if (
-                        (name in missing or name in f"{prefix}.{missing}")
-                        and not missing.endswith(".weight")
-                        and not missing.endswith(".bias")
-                    ):
-                        not_missing_keys.append(missing)
-        return [k for k in missing_keys if k not in not_missing_keys]
-
     # NOTE: TP is applied before quantization so this is only to add hooks.
     # Quantization is incompatible with DTensors, so we have to anyway have
     # gathers! But it should be model independant -> figure out where to put
@@ -223,7 +130,7 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):

         return config

-    def is_serializable(self
+    def is_serializable(self):
         return True

     @property
@@ -246,8 +153,9 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
         if self.pre_quantized and self.quantization_config.dequantize:
             return [
                 # either use the dollar sign, or permute the source patterns to start matching against the scales first
+                # We also collect the activation scales, they will not be used
                 WeightConverter(
-                    source_patterns=["weight$", "weight_scale_inv"],
+                    source_patterns=["weight$", "weight_scale_inv", "activation_scale"],
                     target_patterns="weight",
                     operations=[Fp8Dequantize(self)],
                 )
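
Note on the removed `create_quantized_param` above: the block-wise FP8 scheme it implemented is self-contained and worth stating on its own. Each (block_size_m, block_size_n) tile of the weight is scaled so that its largest magnitude lands on the e4m3fn maximum, cast to FP8, and the reciprocal per-tile scale is stored as `weight_scale_inv`. The sketch below restates that math in plain PyTorch; the helper name and the example shapes are ours, not part of the package, and in rc1 the equivalent work moves into conversion operations such as the `Fp8Dequantize` op referenced above.

```python
import torch


def quantize_fp8_blockwise(weight: torch.Tensor, block_m: int, block_n: int):
    """Per-block max-abs FP8 (e4m3fn) quantization, mirroring the removed code.

    Requires torch >= 2.1 for the float8 dtypes.
    """
    fp8_min = torch.finfo(torch.float8_e4m3fn).min
    fp8_max = torch.finfo(torch.float8_e4m3fn).max

    rows, cols = weight.shape[-2:]
    if rows % block_m or cols % block_n:
        raise ValueError("weight dimensions must be divisible by the block sizes")

    orig_shape = weight.shape
    # Tile the matrix: (..., rows // block_m, cols // block_n, block_m, block_n)
    blocks = weight.reshape(
        -1, rows // block_m, block_m, cols // block_n, block_n
    ).permute(0, 1, 3, 2, 4)

    # One scale per tile, mapping the tile's max magnitude onto the FP8 range.
    scale = fp8_max / torch.amax(blocks.abs(), dim=(-1, -2), keepdim=True)

    quantized = torch.clamp(blocks * scale, fp8_min, fp8_max).to(torch.float8_e4m3fn)
    quantized = quantized.permute(0, 1, 3, 2, 4).reshape(orig_shape)

    # Checkpoints store the reciprocal scale next to the weight as `weight_scale_inv`.
    scale_inv = scale.squeeze().reciprocal()
    return quantized, scale_inv


# Example: a 256x512 weight with 128x128 blocks yields a (2, 4) grid of scales.
w = torch.randn(256, 512)
q, s_inv = quantize_fp8_blockwise(w, 128, 128)
print(q.dtype, q.shape, s_inv.shape)  # torch.float8_e4m3fn torch.Size([256, 512]) torch.Size([2, 4])
```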

transformers/quantizers/quantizer_fp_quant.py

@@ -36,13 +36,10 @@ class FPQuantHfQuantizer(HfQuantizer):
     """

     requires_calibration = False
-    requires_parameters_quantization = True
     is_qat_trainable = True
-    required_packages = ["fp_quant"]

     def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
         super().__init__(quantization_config, **kwargs)
-        self.quantization_config = quantization_config

     def validate_environment(self, device_map, **kwargs):
         if not torch.cuda.is_available() and not is_torch_xpu_available():
@@ -68,15 +65,17 @@ class FPQuantHfQuantizer(HfQuantizer):
                 "You are attempting to load a FPQuant model without setting device_map."
                 " Please set device_map comprised of 'cuda' devices."
             )
-        elif (
-
-
-
-
-
-
-
-
+        elif isinstance(device_map, dict):
+            if (
+                not self.quantization_config.pseudoquantization
+                and len(device_map) > 1
+                and "cpu" in device_map.values()
+                or "disk" in device_map.values()
+            ):
+                raise ValueError(
+                    "You are attempting to load a FPQuant model with a device_map that contains a CPU or disk device."
+                    " This is not supported. Please remove the CPU or disk device from the device_map."
+                )

     def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
         if dtype is None:
@@ -84,50 +83,17 @@ class FPQuantHfQuantizer(HfQuantizer):
             dtype = torch.bfloat16
         elif dtype != torch.bfloat16:
             raise ValueError(f"Invalid `dtype` {dtype}. fp_quant quantization only supports `dtype=torch.bfloat16`.")
-
         return dtype

-    def
-
-
-
-
-
-
-
-
-
-        if target_device == "cpu" and param_name.endswith("weight"):
-            # Works agains hard-coded missing key dispatch to CPU
-            return
-
-        # The module holds either:
-        # * `weight` when `store_master_weights=True`
-        # * `qweight` and `scales` when `store_master_weights=False` and `pseudoquantization=False`
-        # * `dqweight` when `store_master_weights=False` and `pseudoquantization=True`
-
-        if param_name.endswith(".qweight"):
-            # Loading a real quantized checkpoint without master weights
-            module.qweight = torch.nn.Parameter(
-                param_value.to(target_device),
-                requires_grad=False,
-            )
-            module.weight = None
-            module.dqweight = None
-            return
-
-        if param_name.endswith(".dqweight"):
-            # Loading a pseudo-quantized checkpoint without master weights
-            module.dqweight = torch.nn.Parameter(param_value.to(target_device))
-            module.weight = None
-            module.qweight = None
-            module.scales = None
-            return
-
-        # Loading master weights or an unquantized checkpoint
-        module.weight = torch.nn.Parameter(param_value.to(target_device))
-        # Let pre-forward handle the quantization and set None where necessary
-        module.pre_forward()
+    def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
+        from fp_quant import FPQuantLinear
+
+        module, tensor_name = get_module_from_name(model, param_name)
+        if isinstance(module, FPQuantLinear) and tensor_name in ["weight", "qweight", "dqweight"]:
+            # Only quantize weights of FPQuantLinear modules that are not already quantized
+            return True
+        else:
+            return False

     def _process_model_before_weight_loading(
         self,
@@ -142,20 +108,6 @@ class FPQuantHfQuantizer(HfQuantizer):
             model,
             fp_quant_linear_config=adapt_fp_quant_config(self.quantization_config),
         )
-        model.config.quantization_config = self.quantization_config
-
-    def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]:
-        from fp_quant import FPQuantLinear
-
-        fp_quant_names = {name for name, module in model.named_modules() if isinstance(module, FPQuantLinear)}
-
-        def should_exclude(key: str) -> bool:
-            if key.endswith(".weight") or key.endswith(".bias"):
-                return False
-            full_key = f"{prefix}.{key}"
-            return any(name in key or name in full_key for name in fp_quant_names)
-
-        return [key for key in missing_keys if not should_exclude(key)]

     @property
     def is_trainable(self, model: Optional["PreTrainedModel"] = None):
@@ -166,15 +118,33 @@ class FPQuantHfQuantizer(HfQuantizer):
         )
         return trainable

-    def is_serializable(self
+    def is_serializable(self):
         return True

-    def
-        from fp_quant import
-
-
-
-
-
-
-
+    def get_quantize_ops(self):
+        from ..integrations.fp_quant import FpQuantQuantize
+
+        return FpQuantQuantize(self)
+
+    def get_weight_conversions(self):
+        from ..core_model_loading import WeightConverter
+        from ..integrations.fp_quant import FpQuantDeserialize
+
+        if self.pre_quantized:
+            if self.quantization_config.pseudoquantization:
+                return [
+                    WeightConverter(
+                        source_patterns=[".dqweight"],
+                        target_patterns=".dqweight",
+                        operations=[FpQuantDeserialize(self)],
+                    ),
+                ]
+            else:
+                return [
+                    WeightConverter(
+                        source_patterns=[".qweight"],
+                        target_patterns=".qweight",
+                        operations=[FpQuantDeserialize(self)],
+                    ),
+                ]
+        return []
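
The FPQuant hunks follow the same direction as the FP8 quantizer: per-parameter hooks (`create_quantized_param`, `update_missing_keys`) give way to `param_needs_quantization`, `get_quantize_ops`, and declarative `WeightConverter` entries. As a rough mental model only (this toy class is not the `transformers.core_model_loading.WeightConverter` API, whose internals are not shown in this diff), a converter pairs source tensor-name patterns with the operation that produces the target parameter:

```python
import re
from dataclasses import dataclass
from typing import Any, Callable


@dataclass
class ToyWeightConverter:
    """Illustration of the declarative source -> target mapping idea, nothing more."""

    source_patterns: list[str]  # patterns matched against checkpoint tensor names
    target_patterns: str        # name of the parameter the operations produce
    operations: list[Callable[[dict[str, Any]], Any]]

    def matches(self, tensor_name: str) -> bool:
        return any(re.search(pattern, tensor_name) for pattern in self.source_patterns)


# Mirrors the shape of the FPQuant entry above: collect ".qweight" tensors and
# hand them to a deserialize-style operation (a placeholder lambda here).
converter = ToyWeightConverter(
    source_patterns=[r"\.qweight$"],
    target_patterns=".qweight",
    operations=[lambda tensors: tensors],
)
print(converter.matches("model.layers.0.mlp.up_proj.qweight"))  # True
```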

transformers/quantizers/quantizer_gptq.py

@@ -22,7 +22,7 @@ from .base import HfQuantizer
 if TYPE_CHECKING:
     from ..modeling_utils import PreTrainedModel

-from ..utils import
+from ..utils import is_gptqmodel_available, is_optimum_available, is_torch_available, logging
 from ..utils.quantization_config import GPTQConfig, QuantizationConfigMixin


@@ -35,12 +35,11 @@ logger = logging.get_logger(__name__)
 class GptqHfQuantizer(HfQuantizer):
     """
     Quantizer of the GPTQ method - for GPTQ the quantizer support calibration of the model through
-
+    the GPT-QModel package (Python import name `gptqmodel`). Quantization is done under the hood for users if they
+    load a non-prequantized model.
     """

     requires_calibration = False
-    required_packages = ["optimum", "auto_gptq", "gptqmodel"]
-    optimum_quantizer = None

     def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
         super().__init__(quantization_config, **kwargs)
@@ -54,25 +53,12 @@ class GptqHfQuantizer(HfQuantizer):
     def validate_environment(self, *args, **kwargs):
         if not is_optimum_available():
             raise ImportError("Loading a GPTQ quantized model requires optimum (`pip install optimum`)")
-        if is_auto_gptq_available() and is_gptqmodel_available():
-            logger.warning("Detected gptqmodel and auto-gptq, will use gptqmodel")

-        gptq_supports_cpu = (
-            is_auto_gptq_available()
-            and version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2")
-        ) or is_gptqmodel_available()
+        gptq_supports_cpu = is_gptqmodel_available()
         if not gptq_supports_cpu and not torch.cuda.is_available():
             raise RuntimeError("GPU is required to quantize or run quantize model.")
-        elif not
-            raise ImportError(
-                "Loading a GPTQ quantized model requires gptqmodel (`pip install gptqmodel`) or auto-gptq (`pip install auto-gptq`) library. "
-            )
-        elif is_auto_gptq_available() and version.parse(importlib.metadata.version("auto_gptq")) < version.parse(
-            "0.4.2"
-        ):
-            raise ImportError(
-                "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq` or use gptqmodel by `pip install gptqmodel>=1.4.3`."
-            )
+        elif not is_gptqmodel_available():
+            raise ImportError("Loading a GPTQ quantized model requires gptqmodel (`pip install gptqmodel`) library.")
         elif is_gptqmodel_available() and (
             version.parse(importlib.metadata.version("gptqmodel")) < version.parse("1.4.3")
             or version.parse(importlib.metadata.version("optimum")) < version.parse("1.23.99")
@@ -90,9 +76,6 @@ class GptqHfQuantizer(HfQuantizer):
     def update_device_map(self, device_map):
         if device_map is None:
             device_map = {"": torch.device("cpu")}
-        # Only with auto-gptq do not support CPU, we should move the model to cuda if available.
-        if not is_gptqmodel_available() and device_map in ("cpu", {"": torch.device("cpu")}):
-            device_map = {"": 0}
         return device_map

     def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs):
@@ -120,5 +103,5 @@ class GptqHfQuantizer(HfQuantizer):
     def is_trainable(self) -> bool:
         return True

-    def is_serializable(self
+    def is_serializable(self):
         return True
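
With auto-gptq support gone, on-the-fly GPTQ quantization runs exclusively through optimum plus gptqmodel (>= 1.4.3). A hedged usage sketch using the long-standing `GPTQConfig` API; the model id is only a small placeholder:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"  # placeholder: any causal LM works
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Calibration-based 4-bit GPTQ; needs `pip install optimum "gptqmodel>=1.4.3"`.
gptq_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=gptq_config,
    device_map="auto",  # accelerate handles device placement
)
```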

transformers/quantizers/quantizer_higgs.py

@@ -37,12 +37,9 @@ class HiggsHfQuantizer(HfQuantizer):
     """

     requires_calibration = False
-    requires_parameters_quantization = True
-    required_packages = ["flute-kernel", "fast_hadamard_transform"]

     def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
         super().__init__(quantization_config, **kwargs)
-        self.quantization_config = quantization_config

     def validate_environment(self, device_map, **kwargs):
         if not torch.cuda.is_available():
@@ -64,11 +61,12 @@ class HiggsHfQuantizer(HfQuantizer):
                 "You are attempting to load a HIGGS model without setting device_map."
                 " Please set device_map comprised of 'cuda' devices."
             )
-        elif isinstance(device_map, dict)
-
-
-
-
+        elif isinstance(device_map, dict):
+            if "cpu" in device_map.values() or "disk" in device_map.values():
+                raise ValueError(
+                    "You are attempting to load a HIGGS model with a device_map that contains a CPU or disk device."
+                    " This is not supported. Please remove the CPU or disk device from the device_map."
+                )

     def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
         if dtype is None:
@@ -81,37 +79,39 @@ class HiggsHfQuantizer(HfQuantizer):

         return dtype

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # TODO: to remove
+    # Kept here in case we see some interest in adding support for it
+    # def create_quantized_param(
+    #     self,
+    #     model: "PreTrainedModel",
+    #     param_value: "torch.Tensor",
+    #     param_name: str,
+    #     target_device: "torch.device",
+    #     **kwargs,
+    # ):
+    #     from ..integrations import quantize_with_higgs
+
+    #     flute_dict = quantize_with_higgs(
+    #         param_value.to(target_device),
+    #         self.quantization_config.bits,
+    #         self.quantization_config.p,
+    #         self.quantization_config.group_size,
+    #         self.quantization_config.hadamard_size,
+    #     )
+    #     del param_value
+
+    #     module, _ = get_module_from_name(model, param_name)
+    #     module_name = ".".join(param_name.split(".")[:-1])
+    #     for key, value in flute_dict.items():
+    #         if key in module._parameters:
+    #             module._parameters[key] = torch.nn.Parameter(value, requires_grad=False)
+    #         elif key in module._buffers:
+    #             module._buffers[key] = torch.nn.Buffer(value)
+    #         elif key == "tune_metadata":
+    #             module.tune_metadata = value
+    #             self.quantization_config.tune_metadata[module_name] = value.to_dict()
+    #         else:
+    #             raise ValueError(f"Unexpected key {key} in module {module}")

     def _process_model_before_weight_loading(
         self,
@@ -130,7 +130,6 @@ class HiggsHfQuantizer(HfQuantizer):
             quantization_config=self.quantization_config,
             modules_to_not_convert=self.modules_to_not_convert,
         )
-        model.config.quantization_config = self.quantization_config

     def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
         from flute.tune import TuneMetaData, maybe_tune_and_repack
@@ -157,24 +156,11 @@ class HiggsHfQuantizer(HfQuantizer):
             )
             self.quantization_config.tune_metadata[name] = module.tune_metadata.to_dict()

-    def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]:
-        from ..integrations import HiggsLinear
-
-        higgs_names = {name for name, module in model.named_modules() if isinstance(module, HiggsLinear)}
-
-        def should_update(key: str) -> bool:
-            if key.endswith(".weight") or key.endswith(".bias"):
-                return False
-            full_key = f"{prefix}.{key}"
-            return any(name in key or name in full_key for name in higgs_names)
-
-        return [key for key in missing_keys if not should_update(key)]
-
     @property
     def is_trainable(self) -> bool:
         return False

-    def is_serializable(self
+    def is_serializable(self):
         return True

     def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool: