transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/utils/hub.py
CHANGED
|
@@ -37,6 +37,7 @@ from huggingface_hub import (
|
|
|
37
37
|
create_repo,
|
|
38
38
|
hf_hub_download,
|
|
39
39
|
hf_hub_url,
|
|
40
|
+
is_offline_mode,
|
|
40
41
|
list_repo_tree,
|
|
41
42
|
snapshot_download,
|
|
42
43
|
try_to_load_from_cache,
|
|
@@ -83,13 +84,6 @@ class DownloadKwargs(TypedDict, total=False):
|
|
|
83
84
|
commit_hash: str | None
|
|
84
85
|
|
|
85
86
|
|
|
86
|
-
def is_offline_mode():
|
|
87
|
-
# Import inside the function so test patches on `huggingface_hub.constants` are picked up.
|
|
88
|
-
from huggingface_hub import constants as hf_hub_constants
|
|
89
|
-
|
|
90
|
-
return hf_hub_constants.HF_HUB_OFFLINE
|
|
91
|
-
|
|
92
|
-
|
|
93
87
|
# Determine default cache directory.
|
|
94
88
|
# The best way to set the cache path is with the environment variable HF_HOME. For more details, check out this
|
|
95
89
|
# documentation page: https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables.
|
|
@@ -727,8 +721,7 @@ class PushToHubMixin:
|
|
|
727
721
|
revision: str | None = None,
|
|
728
722
|
create_pr: bool = False,
|
|
729
723
|
# Serialization details
|
|
730
|
-
max_shard_size: int | str | None = "
|
|
731
|
-
safe_serialization: bool = True,
|
|
724
|
+
max_shard_size: int | str | None = "50GB",
|
|
732
725
|
tags: list[str] | None = None,
|
|
733
726
|
) -> str:
|
|
734
727
|
"""
|
|
@@ -751,13 +744,10 @@ class PushToHubMixin:
|
|
|
751
744
|
Branch to push the uploaded files to.
|
|
752
745
|
create_pr (`bool`, *optional*, defaults to `False`):
|
|
753
746
|
Whether or not to create a PR with the uploaded files or directly commit.
|
|
754
|
-
max_shard_size (`int` or `str`, *optional*, defaults to `"
|
|
747
|
+
max_shard_size (`int` or `str`, *optional*, defaults to `"50GB"`):
|
|
755
748
|
Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard
|
|
756
749
|
will then be each of size lower than this size. If expressed as a string, needs to be digits followed
|
|
757
|
-
by a unit (like `"5MB"`).
|
|
758
|
-
Google Colab instances without any CPU OOM issues.
|
|
759
|
-
safe_serialization (`bool`, *optional*, defaults to `True`):
|
|
760
|
-
Whether or not to convert the model weights in safetensors format for safer serialization.
|
|
750
|
+
by a unit (like `"5MB"`).
|
|
761
751
|
tags (`list[str]`, *optional*):
|
|
762
752
|
List of tags to push on the Hub.
|
|
763
753
|
|
|
@@ -783,7 +773,7 @@ class PushToHubMixin:
|
|
|
783
773
|
|
|
784
774
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
785
775
|
# Save all files.
|
|
786
|
-
self.save_pretrained(tmp_dir, max_shard_size=max_shard_size
|
|
776
|
+
self.save_pretrained(tmp_dir, max_shard_size=max_shard_size)
|
|
787
777
|
|
|
788
778
|
# Update model card
|
|
789
779
|
model_card.save(os.path.join(tmp_dir, "README.md"))
|
|
@@ -55,9 +55,15 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> tuple[
|
|
|
55
55
|
# importlib.metadata works with the distribution package, which may be different from the import
|
|
56
56
|
# name (e.g. `PIL` is the import name, but `pillow` is the distribution name)
|
|
57
57
|
distributions = PACKAGE_DISTRIBUTION_MAPPING[pkg_name]
|
|
58
|
-
#
|
|
59
|
-
#
|
|
60
|
-
|
|
58
|
+
# Per PEP 503, underscores and hyphens are equivalent in package names.
|
|
59
|
+
# Prefer the distribution that matches the (normalized) package name.
|
|
60
|
+
normalized_pkg_name = pkg_name.replace("_", "-")
|
|
61
|
+
if normalized_pkg_name in distributions:
|
|
62
|
+
distribution_name = normalized_pkg_name
|
|
63
|
+
elif pkg_name in distributions:
|
|
64
|
+
distribution_name = pkg_name
|
|
65
|
+
else:
|
|
66
|
+
distribution_name = distributions[0]
|
|
61
67
|
package_version = importlib.metadata.version(distribution_name)
|
|
62
68
|
except (importlib.metadata.PackageNotFoundError, KeyError):
|
|
63
69
|
# If we cannot find the metadata (because of editable install for example), try to import directly.
|
|
@@ -71,6 +77,16 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> tuple[
|
|
|
71
77
|
return package_exists
|
|
72
78
|
|
|
73
79
|
|
|
80
|
+
def is_env_variable_true(env_variable: str) -> bool:
|
|
81
|
+
"""Detect whether `env_variable` has been set to a true value in the environment"""
|
|
82
|
+
return os.getenv(env_variable, "false").lower() in ("true", "1", "y", "yes", "on")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def is_env_variable_false(env_variable: str) -> bool:
|
|
86
|
+
"""Detect whether `env_variable` has been set to a false value in the environment"""
|
|
87
|
+
return os.getenv(env_variable, "true").lower() in ("false", "0", "n", "no", "off")
|
|
88
|
+
|
|
89
|
+
|
|
74
90
|
ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
|
|
75
91
|
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
|
|
76
92
|
|
|
@@ -536,6 +552,11 @@ def is_torch_flex_attn_available() -> bool:
|
|
|
536
552
|
return is_torch_available() and version.parse(get_torch_version()) >= version.parse("2.5.0")
|
|
537
553
|
|
|
538
554
|
|
|
555
|
+
@lru_cache
|
|
556
|
+
def is_grouped_mm_available() -> bool:
|
|
557
|
+
return is_torch_available() and version.parse(get_torch_version()) >= version.parse("2.9.0")
|
|
558
|
+
|
|
559
|
+
|
|
539
560
|
@lru_cache
|
|
540
561
|
def is_kenlm_available() -> bool:
|
|
541
562
|
return _is_package_available("kenlm")
|
|
@@ -869,14 +890,17 @@ def is_flash_attn_2_available() -> bool:
|
|
|
869
890
|
|
|
870
891
|
import torch
|
|
871
892
|
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
893
|
+
try:
|
|
894
|
+
if torch.version.cuda:
|
|
895
|
+
return version.parse(flash_attn_version) >= version.parse("2.1.0")
|
|
896
|
+
elif torch.version.hip:
|
|
897
|
+
# TODO: Bump the requirement to 2.1.0 once released in https://github.com/ROCmSoftwarePlatform/flash-attention
|
|
898
|
+
return version.parse(flash_attn_version) >= version.parse("2.0.4")
|
|
899
|
+
elif is_torch_mlu_available():
|
|
900
|
+
return version.parse(flash_attn_version) >= version.parse("2.3.3")
|
|
901
|
+
else:
|
|
902
|
+
return False
|
|
903
|
+
except packaging.version.InvalidVersion:
|
|
880
904
|
return False
|
|
881
905
|
|
|
882
906
|
|
|
@@ -894,7 +918,12 @@ def is_flash_attn_greater_or_equal_2_10() -> bool:
|
|
|
894
918
|
@lru_cache
|
|
895
919
|
def is_flash_attn_greater_or_equal(library_version: str) -> bool:
|
|
896
920
|
is_available, flash_attn_version = _is_package_available("flash_attn", return_version=True)
|
|
897
|
-
|
|
921
|
+
if not is_available:
|
|
922
|
+
return False
|
|
923
|
+
try:
|
|
924
|
+
return version.parse(flash_attn_version) >= version.parse(library_version)
|
|
925
|
+
except packaging.version.InvalidVersion:
|
|
926
|
+
return False
|
|
898
927
|
|
|
899
928
|
|
|
900
929
|
@lru_cache
|
|
@@ -978,7 +1007,7 @@ def is_optimum_available() -> bool:
|
|
|
978
1007
|
|
|
979
1008
|
|
|
980
1009
|
@lru_cache
|
|
981
|
-
def
|
|
1010
|
+
def is_llm_awq_available() -> bool:
|
|
982
1011
|
return _is_package_available("awq")
|
|
983
1012
|
|
|
984
1013
|
|
|
@@ -1015,21 +1044,11 @@ def is_compressed_tensors_available() -> bool:
|
|
|
1015
1044
|
return _is_package_available("compressed_tensors")
|
|
1016
1045
|
|
|
1017
1046
|
|
|
1018
|
-
@lru_cache
|
|
1019
|
-
def is_auto_gptq_available() -> bool:
|
|
1020
|
-
return _is_package_available("auto_gptq")
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
1047
|
@lru_cache
|
|
1024
1048
|
def is_gptqmodel_available() -> bool:
|
|
1025
1049
|
return _is_package_available("gptqmodel")
|
|
1026
1050
|
|
|
1027
1051
|
|
|
1028
|
-
@lru_cache
|
|
1029
|
-
def is_eetq_available() -> bool:
|
|
1030
|
-
return _is_package_available("eetq")
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
1052
|
@lru_cache
|
|
1034
1053
|
def is_fbgemm_gpu_available() -> bool:
|
|
1035
1054
|
return _is_package_available("fbgemm_gpu")
|
|
@@ -1065,6 +1084,11 @@ def is_pytest_available() -> bool:
|
|
|
1065
1084
|
return _is_package_available("pytest")
|
|
1066
1085
|
|
|
1067
1086
|
|
|
1087
|
+
@lru_cache
|
|
1088
|
+
def is_pytest_order_available() -> bool:
|
|
1089
|
+
return is_pytest_available() and _is_package_available("pytest_order")
|
|
1090
|
+
|
|
1091
|
+
|
|
1068
1092
|
@lru_cache
|
|
1069
1093
|
def is_spacy_available() -> bool:
|
|
1070
1094
|
return _is_package_available("spacy")
|
|
@@ -1100,6 +1124,16 @@ def is_nltk_available() -> bool:
|
|
|
1100
1124
|
return _is_package_available("nltk")
|
|
1101
1125
|
|
|
1102
1126
|
|
|
1127
|
+
@lru_cache
|
|
1128
|
+
def is_numba_available() -> bool:
|
|
1129
|
+
is_available = _is_package_available("numba")
|
|
1130
|
+
if not is_available:
|
|
1131
|
+
return False
|
|
1132
|
+
|
|
1133
|
+
numpy_available, numpy_version = _is_package_available("numpy", return_version=True)
|
|
1134
|
+
return not numpy_available or version.parse(numpy_version) < version.parse("2.2.0")
|
|
1135
|
+
|
|
1136
|
+
|
|
1103
1137
|
@lru_cache
|
|
1104
1138
|
def is_torchaudio_available() -> bool:
|
|
1105
1139
|
return _is_package_available("torchaudio")
|
|
@@ -1297,6 +1331,34 @@ def is_torch_fx_proxy(x):
|
|
|
1297
1331
|
return False
|
|
1298
1332
|
|
|
1299
1333
|
|
|
1334
|
+
def is_jax_jitting(x):
|
|
1335
|
+
"""returns True if we are inside of `jax.jit` context, False otherwise.
|
|
1336
|
+
|
|
1337
|
+
When a torch model is being compiled with `jax.jit` using torchax,
|
|
1338
|
+
the tensor that goes through the model would be an instance of
|
|
1339
|
+
`torchax.tensor.Tensor`, which is a tensor subclass. This tensor has
|
|
1340
|
+
a `jax` method to return the inner Jax array
|
|
1341
|
+
(https://github.com/google/torchax/blob/13ce870a1d9adb2430333c27bb623469e3aea34e/torchax/tensor.py#L134).
|
|
1342
|
+
Here we use duck typing to detect whether the inner jax array is a jax Tracer; if it is,
|
|
1343
|
+
then we are in tracing context. (See more at: https://github.com/jax-ml/jax/discussions/9241)
|
|
1344
|
+
|
|
1345
|
+
Args:
|
|
1346
|
+
x: torch.Tensor
|
|
1347
|
+
|
|
1348
|
+
Returns:
|
|
1349
|
+
bool: whether we are inside of jax jit tracing.
|
|
1350
|
+
"""
|
|
1351
|
+
|
|
1352
|
+
if not hasattr(x, "jax"):
|
|
1353
|
+
return False
|
|
1354
|
+
try:
|
|
1355
|
+
import jax
|
|
1356
|
+
|
|
1357
|
+
return isinstance(x.jax(), jax.core.Tracer)
|
|
1358
|
+
except Exception:
|
|
1359
|
+
return False
|
|
1360
|
+
|
|
1361
|
+
|
|
1300
1362
|
def is_jit_tracing() -> bool:
|
|
1301
1363
|
try:
|
|
1302
1364
|
import torch
|
|
@@ -1306,13 +1368,24 @@ def is_jit_tracing() -> bool:
|
|
|
1306
1368
|
return False
|
|
1307
1369
|
|
|
1308
1370
|
|
|
1371
|
+
def is_cuda_stream_capturing() -> bool:
|
|
1372
|
+
try:
|
|
1373
|
+
import torch
|
|
1374
|
+
|
|
1375
|
+
return torch.cuda.is_current_stream_capturing()
|
|
1376
|
+
except Exception:
|
|
1377
|
+
return False
|
|
1378
|
+
|
|
1379
|
+
|
|
1309
1380
|
def is_tracing(tensor=None) -> bool:
|
|
1310
|
-
"""Checks whether we are tracing a graph with dynamo (compile or export), torch.jit,
|
|
1381
|
+
"""Checks whether we are tracing a graph with dynamo (compile or export), torch.jit, torch.fx, jax.jit (with torchax) or
|
|
1382
|
+
CUDA stream capturing"""
|
|
1311
1383
|
# Note that `is_torchdynamo_compiling` checks both compiling and exporting (the export check is stricter and
|
|
1312
1384
|
# only checks export)
|
|
1313
|
-
_is_tracing = is_torchdynamo_compiling() or is_jit_tracing()
|
|
1385
|
+
_is_tracing = is_torchdynamo_compiling() or is_jit_tracing() or is_cuda_stream_capturing()
|
|
1314
1386
|
if tensor is not None:
|
|
1315
1387
|
_is_tracing |= is_torch_fx_proxy(tensor)
|
|
1388
|
+
_is_tracing |= is_jax_jitting(tensor)
|
|
1316
1389
|
return _is_tracing
|
|
1317
1390
|
|
|
1318
1391
|
|
|
@@ -1780,6 +1853,20 @@ BACKENDS_MAPPING = OrderedDict(
|
|
|
1780
1853
|
|
|
1781
1854
|
|
|
1782
1855
|
def requires_backends(obj, backends):
|
|
1856
|
+
"""
|
|
1857
|
+
Method that automatically raises in case the specified backends are not available. It is often used during class
|
|
1858
|
+
initialization to ensure the required dependencies are installed:
|
|
1859
|
+
|
|
1860
|
+
```py
|
|
1861
|
+
requires_backends(self, ["torch"])
|
|
1862
|
+
```
|
|
1863
|
+
|
|
1864
|
+
The backends should be defined in the `BACKENDS_MAPPING` defined in `transformers.utils.import_utils`.
|
|
1865
|
+
|
|
1866
|
+
Args:
|
|
1867
|
+
obj: object to be checked
|
|
1868
|
+
backends: list or tuple of backends to check.
|
|
1869
|
+
"""
|
|
1783
1870
|
if not isinstance(backends, (list, tuple)):
|
|
1784
1871
|
backends = [backends]
|
|
1785
1872
|
|
|
@@ -71,14 +71,36 @@ def add_to_mapping(layer_name, device, repo_name, mode, compatible_mapping):
|
|
|
71
71
|
}
|
|
72
72
|
|
|
73
73
|
|
|
74
|
+
def add_to_mapping_local(layer_name, device, repo_name, mode, compatible_mapping):
|
|
75
|
+
from pathlib import Path
|
|
76
|
+
|
|
77
|
+
from kernels import LocalLayerRepository
|
|
78
|
+
|
|
79
|
+
if device not in ["cuda", "rocm", "xpu", "npu"]:
|
|
80
|
+
raise ValueError(f"Only cuda, rocm, xpu and npu devices supported, got: {device}")
|
|
81
|
+
repo_layer_name = repo_name.split(":")[1]
|
|
82
|
+
repo_path = repo_name.split(":")[0]
|
|
83
|
+
repo_package_name = repo_path.split("/")[-1]
|
|
84
|
+
compatible_mapping[layer_name] = {
|
|
85
|
+
device: {
|
|
86
|
+
mode: LocalLayerRepository(
|
|
87
|
+
repo_path=Path(repo_path),
|
|
88
|
+
package_name=repo_package_name,
|
|
89
|
+
layer_name=repo_layer_name,
|
|
90
|
+
)
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
|
|
74
95
|
class KernelConfig(PushToHubMixin):
|
|
75
96
|
"""
|
|
76
97
|
Kernel configuration class. This class is used to configure the kernel mapping for a model.
|
|
77
98
|
"""
|
|
78
99
|
|
|
79
|
-
def __init__(self, kernel_mapping={}):
|
|
100
|
+
def __init__(self, kernel_mapping={}, use_local_kernel=False):
|
|
80
101
|
self.kernel_mapping = kernel_mapping
|
|
81
102
|
self.registered_layer_names = {}
|
|
103
|
+
self.use_local_kernel = use_local_kernel
|
|
82
104
|
|
|
83
105
|
def update_kernel(self, repo_id, registered_name, layer_name, device, mode, revision=None):
|
|
84
106
|
from kernels import LayerRepository
|
|
@@ -105,6 +127,7 @@ class KernelConfig(PushToHubMixin):
|
|
|
105
127
|
2. Each kernel value is either a string of the form 'org/repo:layer_name' or a dict mapping device types ("cuda", "rocm", "xpu", "npu") to such strings.
|
|
106
128
|
3. Each device key in a dict is one of "cuda", "rocm", "xpu", or "npu".
|
|
107
129
|
4. Each repo_name is a valid repository and layer name in the format 'org/repo:layer_name' (i.e., a string containing both a slash and a colon).
|
|
130
|
+
5. If `use_local_kernel` is set, each kernel string should be in the format '/abs/path:layer_name'. The last component of the absolute path is used as the `package_name`, e.g. "/home/user/layer_norm" gives the package name "layer_norm".
|
|
108
131
|
|
|
109
132
|
Args:
|
|
110
133
|
model: The model instance whose modules are checked for registered kernel_layer_name attributes.
|
|
@@ -114,14 +137,13 @@ class KernelConfig(PushToHubMixin):
|
|
|
114
137
|
or if a repo_name is not a valid 'org/repo:layer_name' string.
|
|
115
138
|
"""
|
|
116
139
|
MAPPING_FORMAT = """
|
|
140
|
+
For single device form remote
|
|
117
141
|
{
|
|
118
142
|
"RMSNorm":
|
|
119
143
|
"kernels-community/layer_norm:LlamaRMSNorm",
|
|
120
144
|
...
|
|
121
145
|
},
|
|
122
|
-
|
|
123
|
-
or
|
|
124
|
-
|
|
146
|
+
For multiple devices form remote
|
|
125
147
|
{
|
|
126
148
|
"RMSNorm": {
|
|
127
149
|
"cuda":
|
|
@@ -132,6 +154,23 @@ class KernelConfig(PushToHubMixin):
|
|
|
132
154
|
},
|
|
133
155
|
...
|
|
134
156
|
}
|
|
157
|
+
For single device form local
|
|
158
|
+
{
|
|
159
|
+
"RMSNorm":
|
|
160
|
+
"/abs/path:LlamaRMSNorm",
|
|
161
|
+
...
|
|
162
|
+
},
|
|
163
|
+
For multiple devices form local
|
|
164
|
+
{
|
|
165
|
+
"RMSNorm": {
|
|
166
|
+
"cuda":
|
|
167
|
+
"/abs/path:LlamaRMSNorm",
|
|
168
|
+
"rocm":
|
|
169
|
+
"/abs/path:LlamaRMSNorm",
|
|
170
|
+
...
|
|
171
|
+
},
|
|
172
|
+
...
|
|
173
|
+
}
|
|
135
174
|
"""
|
|
136
175
|
self.store_registered_layer_names(model)
|
|
137
176
|
# Validate that the kernel mapping is a dict
|
|
@@ -149,7 +188,7 @@ class KernelConfig(PushToHubMixin):
|
|
|
149
188
|
if isinstance(kernel, str):
|
|
150
189
|
if "/" not in kernel or ":" not in kernel:
|
|
151
190
|
raise ValueError(
|
|
152
|
-
f"Kernel mapping for '{layer_name}' must be a valid repo name with a layer name (e.g., 'org/repo:layer_name'), got: {kernel}"
|
|
191
|
+
f"Kernel mapping for '{layer_name}' must be a valid repo name with a layer name (e.g., 'org/repo:layer_name' or '/abs/path:layer_name'), got: {kernel}"
|
|
153
192
|
)
|
|
154
193
|
|
|
155
194
|
elif isinstance(kernel, dict):
|
|
@@ -159,9 +198,8 @@ class KernelConfig(PushToHubMixin):
|
|
|
159
198
|
|
|
160
199
|
if not isinstance(repo_name, str) or "/" not in repo_name or ":" not in repo_name:
|
|
161
200
|
raise ValueError(
|
|
162
|
-
f"Kernel mapping for '{layer_name}' must be a valid repo name with a layer name (e.g., 'org/repo:layer_name'), got: {repo_name}"
|
|
201
|
+
f"Kernel mapping for '{layer_name}' must be a valid repo name with a layer name (e.g., 'org/repo:layer_name' or '/abs/path:layer_name'), got: {repo_name}"
|
|
163
202
|
)
|
|
164
|
-
|
|
165
203
|
else:
|
|
166
204
|
raise ValueError(f"Kernel mapping must follow the format: {MAPPING_FORMAT}, got: {kernel}")
|
|
167
205
|
|
|
@@ -174,18 +212,13 @@ class KernelConfig(PushToHubMixin):
|
|
|
174
212
|
...
|
|
175
213
|
},
|
|
176
214
|
|
|
177
|
-
or
|
|
215
|
+
or for local path:
|
|
178
216
|
|
|
179
217
|
{
|
|
180
|
-
"RMSNorm":
|
|
181
|
-
"
|
|
182
|
-
"kernels-community/layer_norm:LlamaRMSNorm",
|
|
183
|
-
"rocm":
|
|
184
|
-
"kernels-community/layer_norm:LlamaRMSNorm",
|
|
185
|
-
...
|
|
186
|
-
},
|
|
218
|
+
"RMSNorm":
|
|
219
|
+
"/home/user/liger_kernels:LigerRMSNorm",
|
|
187
220
|
...
|
|
188
|
-
}
|
|
221
|
+
},
|
|
189
222
|
|
|
190
223
|
into a nested mapping:
|
|
191
224
|
|
|
@@ -200,6 +233,20 @@ class KernelConfig(PushToHubMixin):
|
|
|
200
233
|
}
|
|
201
234
|
}
|
|
202
235
|
|
|
236
|
+
or for local path:
|
|
237
|
+
|
|
238
|
+
{
|
|
239
|
+
"RMSNorm": {
|
|
240
|
+
"cuda": {
|
|
241
|
+
Mode.INFERENCE: LocalLayerRepository(
|
|
242
|
+
repo_path=Path("/home/user/liger_kernels"),
|
|
243
|
+
package_name="liger_kernels",
|
|
244
|
+
layer_name="LigerRMSNorm",
|
|
245
|
+
)
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
|
|
203
250
|
that's compatible with the kernels library.
|
|
204
251
|
|
|
205
252
|
The device is inferred from the model's parameters if not provided.
|
|
@@ -208,6 +255,7 @@ class KernelConfig(PushToHubMixin):
|
|
|
208
255
|
from kernels import Mode
|
|
209
256
|
|
|
210
257
|
compatible_mapping = {}
|
|
258
|
+
current_device = infer_device(model)
|
|
211
259
|
for layer_name, kernel in self.kernel_mapping.items():
|
|
212
260
|
# Infer Mode: use Mode.TRAINING if model is training, else use Mode.INFERENCE
|
|
213
261
|
mode = Mode.TRAINING if model.training else Mode.INFERENCE
|
|
@@ -216,10 +264,17 @@ class KernelConfig(PushToHubMixin):
|
|
|
216
264
|
|
|
217
265
|
if isinstance(kernel, str):
|
|
218
266
|
repo_name = kernel
|
|
219
|
-
|
|
220
|
-
|
|
267
|
+
if not self.use_local_kernel:
|
|
268
|
+
add_to_mapping(layer_name, current_device, repo_name, mode, compatible_mapping)
|
|
269
|
+
else:
|
|
270
|
+
add_to_mapping_local(layer_name, current_device, repo_name, mode, compatible_mapping)
|
|
221
271
|
elif isinstance(kernel, dict):
|
|
222
272
|
for device, repo_name in kernel.items():
|
|
223
|
-
|
|
273
|
+
if device != current_device:
|
|
274
|
+
continue
|
|
275
|
+
if not self.use_local_kernel:
|
|
276
|
+
add_to_mapping(layer_name, device, repo_name, mode, compatible_mapping)
|
|
277
|
+
else:
|
|
278
|
+
add_to_mapping_local(layer_name, device, repo_name, mode, compatible_mapping)
|
|
224
279
|
|
|
225
280
|
self.kernel_mapping = compatible_mapping
|
|
@@ -148,9 +148,8 @@ def log_state_dict_report(
|
|
|
148
148
|
mismatched_keys=None,
|
|
149
149
|
mismatched_shapes=None,
|
|
150
150
|
ignore_mismatched_sizes=True,
|
|
151
|
-
|
|
151
|
+
conversion_errors=None,
|
|
152
152
|
color=True, # allow disabling for plain logs
|
|
153
|
-
min_width_full_table=60, # terminal min width to attempt full table
|
|
154
153
|
):
|
|
155
154
|
"""Log a readable report about state_dict loading issues.
|
|
156
155
|
|
|
@@ -165,12 +164,13 @@ def log_state_dict_report(
|
|
|
165
164
|
missing_keys = missing_keys or []
|
|
166
165
|
mismatched_keys = mismatched_keys or []
|
|
167
166
|
mismatched_shapes = mismatched_shapes or []
|
|
168
|
-
|
|
167
|
+
conversion_errors = conversion_errors or {}
|
|
169
168
|
|
|
170
169
|
# Detect whether the current stdout supports ANSI colors; allow callers to pass `color=False` to force no color
|
|
171
170
|
color_enabled = bool(color and sys.stdout.isatty())
|
|
172
171
|
ansi = ANSI(color_enabled)
|
|
173
172
|
|
|
173
|
+
# Re-raise errors early if needed
|
|
174
174
|
if error_msgs:
|
|
175
175
|
error_msg = "\n\t".join(error_msgs)
|
|
176
176
|
if "size mismatch" in error_msg:
|
|
@@ -204,9 +204,9 @@ def log_state_dict_report(
|
|
|
204
204
|
)
|
|
205
205
|
rows.append(data)
|
|
206
206
|
|
|
207
|
-
if
|
|
208
|
-
for k, v in update_key_name(
|
|
209
|
-
status = "
|
|
207
|
+
if conversion_errors:
|
|
208
|
+
for k, v in update_key_name(conversion_errors).items():
|
|
209
|
+
status = "CONVERSION"
|
|
210
210
|
status = _color(status, "purple", ansi)
|
|
211
211
|
_details = v[:term_w]
|
|
212
212
|
rows.append([k, status, _details])
|
|
@@ -228,16 +228,25 @@ def log_state_dict_report(
|
|
|
228
228
|
if unexpected_keys:
|
|
229
229
|
tips += f"\n- {_color('UNEXPECTED', 'orange', ansi) + ansi['italic']}\t:can be ignored when loading from different task/architecture; not ok if you expect identical arch."
|
|
230
230
|
if missing_keys:
|
|
231
|
-
tips += f"\n- {_color('MISSING', 'red', ansi) + ansi['italic']}\t:those params were newly initialized because missing
|
|
231
|
+
tips += f"\n- {_color('MISSING', 'red', ansi) + ansi['italic']}\t:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task."
|
|
232
232
|
if mismatched_keys:
|
|
233
|
-
tips += f"\n- {_color('MISMATCH', 'yellow', ansi) + ansi['italic']}\t:ckpt weights were loaded, but they did not match the original empty weight."
|
|
234
|
-
if
|
|
235
|
-
tips += f"\n- {_color('
|
|
233
|
+
tips += f"\n- {_color('MISMATCH', 'yellow', ansi) + ansi['italic']}\t:ckpt weights were loaded, but they did not match the original empty weight shapes."
|
|
234
|
+
if conversion_errors:
|
|
235
|
+
tips += f"\n- {_color('CONVERSION', 'purple', ansi) + ansi['italic']}\t:originate from the conversion scheme"
|
|
236
236
|
tips += f"{ansi['reset']}"
|
|
237
237
|
|
|
238
|
+
# Log the report as warning
|
|
238
239
|
logger.warning(prelude + table + tips)
|
|
240
|
+
|
|
241
|
+
# Re-raise in that case, after the report has been logged
|
|
242
|
+
if conversion_errors:
|
|
243
|
+
raise RuntimeError(
|
|
244
|
+
"We encountered some issues during automatic conversion of the weights. For details look at the `CONVERSION` entries of "
|
|
245
|
+
"the above report!"
|
|
246
|
+
)
|
|
239
247
|
if not ignore_mismatched_sizes and mismatched_keys:
|
|
240
248
|
raise RuntimeError(
|
|
241
249
|
"You set `ignore_mismatched_sizes` to `False`, thus raising an error. For details look at the above report!"
|
|
242
250
|
)
|
|
251
|
+
|
|
243
252
|
return prelude + table + tips
|