transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/integrations/fp_quant.py

@@ -13,6 +13,10 @@
 # limitations under the License.
 "FP-Quant integration file"
 
+from typing import Optional
+
+import torch
+
 from ..utils import (
     is_fp_quant_available,
 )
@@ -24,6 +28,94 @@ if is_fp_quant_available():
 
     from transformers.utils.quantization_config import FPQuantConfig
 
+from ..core_model_loading import ConversionOps
+from ..quantizers.quantizers_utils import get_module_from_name
+
+
+class FpQuantQuantize(ConversionOps):
+    def __init__(self, hf_quantizer):
+        self.hf_quantizer = hf_quantizer
+
+    def convert(
+        self,
+        input_dict: torch.Tensor,
+        model: Optional[torch.nn.Module] = None,
+        missing_keys: Optional[list[str]] = None,
+        **kwargs,
+    ) -> dict[str, torch.Tensor]:
+        target_key, value = tuple(input_dict.items())[0]
+        value = value[0]
+        # Loading master weights or an unquantized checkpoint
+        weight = torch.nn.Parameter(value)
+        module, _ = get_module_from_name(model, target_key)
+        module.weight = weight
+
+        # Let pre-forward handle the quantization and set None where necessary
+        # This operation will quantize the weights internally
+        with torch.cuda.device(value.device):
+            module.pre_forward()
+
+        prefix_target_key = target_key.rsplit(".", 1)[0]
+
+        # keys are set inside the module.pre_forward() method, we don't need remove them from the missing keys list
+        missing_keys.discard(target_key)
+        missing_keys.discard(f"{prefix_target_key}.backward_hadamard_matrix")
+        missing_keys.discard(f"{prefix_target_key}.forward_hadamard_matrix")
+        missing_keys.discard(f"{prefix_target_key}.act_global_scale")
+        missing_keys.discard(f"{prefix_target_key}.weight_global_scale")
+        missing_keys.discard(f"{prefix_target_key}.qweight")
+        missing_keys.discard(f"{prefix_target_key}.scales")
+        missing_keys.discard(f"{prefix_target_key}.dqweight")
+        return {}
+
+
+class FpQuantDeserialize(ConversionOps):
+    def __init__(self, hf_quantizer):
+        self.hf_quantizer = hf_quantizer
+
+    def convert(
+        self,
+        input_dict: torch.Tensor,
+        model: Optional[torch.nn.Module] = None,
+        full_layer_name: str | None = None,
+        missing_keys: Optional[list[str]] = None,
+        **kwargs,
+    ) -> dict[str, torch.Tensor]:
+        target_key, value = tuple(input_dict.items())[0]
+        value = value[0] if isinstance(value, list) else value
+        module, _ = get_module_from_name(model, target_key)
+        # The module holds either:
+        # * `weight` when `store_master_weights=True`
+        # * `qweight` and `scales` when `store_master_weights=False` and `pseudoquantization=False`
+        # * `dqweight` when `store_master_weights=False` and `pseudoquantization=True`
+        if target_key == ".qweight":
+            # Loading a real quantized checkpoint without master weights
+            qweight = torch.nn.Parameter(
+                value,
+                requires_grad=False,
+            )
+
+            return {
+                ".qweight": qweight,
+                # the way the FPQuantLinear module is designed, these parameters are expected in the model
+                # even though they are not used so we need to set them to zeros
+                ".weight": torch.nn.Parameter(torch.zeros(0)),
+                ".dqweight": torch.nn.Parameter(torch.zeros(0)),
+            }
+
+        if target_key == ".dqweight":
+            # Loading a pseudo-quantized checkpoint without master weights
+            dqweight = torch.nn.Parameter(value)
+
+            return {
+                ".dqweight": dqweight,
+                # the way the FPQuantLinear module ips designed, these parameters are expected in the model
+                # even though they are not used so we need to set them to zeros
+                ".weight": torch.nn.Parameter(torch.zeros(0)),
+                ".qweight": torch.nn.Parameter(torch.zeros(0)),
+                ".scales": torch.nn.Parameter(torch.zeros(0)),
+            }
+
 
 def adapt_fp_quant_config(config: FPQuantConfig):
     if config.forward_dtype == "mxfp4":
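The two conversion ops added to transformers/integrations/fp_quant.py above follow one contract: `convert` receives a one-entry `{checkpoint_key: value}` dict and decides, from the key, which parameters to materialize on the target module. The sketch below imitates that key-based dispatch with a self-contained toy base class; `ToyConversionOp`, `ToyDequantOrPassthrough`, and the key layout are illustrative stand-ins, not the actual `ConversionOps` API from `transformers/core_model_loading.py`.

# Illustrative sketch only: a toy stand-in for the key-based dispatch used by the new
# FP-Quant conversion ops. Class names and key layout are hypothetical.
import torch


class ToyConversionOp:
    """Minimal stand-in for a checkpoint-conversion hook."""

    def convert(self, input_dict: dict, **kwargs) -> dict:
        raise NotImplementedError


class ToyDequantOrPassthrough(ToyConversionOp):
    """Branch on the checkpoint key suffix, mirroring the `.qweight` (real quantized)
    vs `.dqweight` (pseudo-quantized) split described in the comments above."""

    def convert(self, input_dict, **kwargs):
        target_key, value = next(iter(input_dict.items()))
        value = value[0] if isinstance(value, list) else value

        if target_key.endswith(".qweight"):
            # real quantized checkpoint: keep the packed weight, zero-fill the unused slot
            return {
                target_key: torch.nn.Parameter(value, requires_grad=False),
                target_key.replace(".qweight", ".weight"): torch.nn.Parameter(torch.zeros(0)),
            }
        if target_key.endswith(".dqweight"):
            # pseudo-quantized checkpoint: the dequantized weight is stored directly
            return {target_key: torch.nn.Parameter(value, requires_grad=False)}
        # anything else is treated as a plain (master) weight
        return {target_key: torch.nn.Parameter(value, requires_grad=False)}


op = ToyDequantOrPassthrough()
out = op.convert({"layers.0.linear.qweight": [torch.zeros(4, 4, dtype=torch.uint8)]})
print(sorted(out))  # ['layers.0.linear.qweight', 'layers.0.linear.weight']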
transformers/integrations/ggml.py

@@ -76,7 +76,7 @@ GGUF_CONFIG_MAPPING = {
         "attention.layer_norm_rms_epsilon": "rms_norm_eps",
         "vocab_size": "vocab_size",
     },
-    "
+    "qwen2_moe": {
         "context_length": "max_position_embeddings",
         "block_count": "num_hidden_layers",
         "feed_forward_length": "intermediate_size",
@@ -313,6 +313,16 @@ GGUF_TOKENIZER_MAPPING = {
     },
 }
 
+# We only need to set here the parameters that default to different values between transformers and llamacpp.
+GGUF_CONFIG_DEFAULTS_MAPPING = {
+    "qwen3_moe": {
+        # NOTE: Qwen3MoeConfig defaults to false but llama.cpp needs this to be true.
+        # See: https://github.com/ggml-org/llama.cpp/blob/17f7f4baad8b3a716ee139da7bb56ae984e8c0fa/src/models/qwen3moe.cpp#L85-L96
+        # (the parameter right after LLM_FFN_SILU corresponds to norm_topk_prob)
+        "norm_topk_prob": True,
+    },
+}
+
 
 def _gguf_parse_value(_value, data_type):
     if not isinstance(data_type, list):
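The GGUF_CONFIG_DEFAULTS_MAPPING block added above only lists parameters whose transformers defaults differ from what llama.cpp assumes for the same architecture. A minimal sketch of how such a table can be overlaid onto parsed GGUF metadata follows; `apply_gguf_defaults` is a hypothetical helper written for illustration, not the code path transformers actually uses.

# Hypothetical illustration: overlay architecture-specific defaults onto config kwargs
# parsed from GGUF metadata, letting values read from the file take precedence.
GGUF_CONFIG_DEFAULTS = {
    "qwen3_moe": {"norm_topk_prob": True},
}


def apply_gguf_defaults(architecture: str, parsed_config: dict) -> dict:
    defaults = GGUF_CONFIG_DEFAULTS.get(architecture, {})
    # defaults go first so anything explicitly present in the GGUF metadata overrides them
    return {**defaults, **parsed_config}


print(apply_gguf_defaults("qwen3_moe", {"num_hidden_layers": 48}))
# {'norm_topk_prob': True, 'num_hidden_layers': 48}

Merging the defaults first means an explicit `norm_topk_prob` in the GGUF metadata would still win over the table entry.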
@@ -15,17 +15,13 @@
|
|
|
15
15
|
|
|
16
16
|
from math import sqrt
|
|
17
17
|
|
|
18
|
-
from ..
|
|
19
|
-
|
|
20
|
-
is_hadamard_available,
|
|
21
|
-
is_torch_available,
|
|
22
|
-
)
|
|
18
|
+
from ..quantizers.quantizers_utils import should_convert_module
|
|
19
|
+
from ..utils import is_flute_available, is_hadamard_available, is_torch_available, logging
|
|
23
20
|
|
|
24
21
|
|
|
25
22
|
if is_torch_available():
|
|
26
23
|
import torch
|
|
27
|
-
|
|
28
|
-
|
|
24
|
+
import torch.nn as nn
|
|
29
25
|
|
|
30
26
|
if is_flute_available():
|
|
31
27
|
from flute.integrations.higgs import prepare_data_transposed
|
|
@@ -34,6 +30,8 @@ if is_flute_available():
|
|
|
34
30
|
if is_hadamard_available():
|
|
35
31
|
from fast_hadamard_transform import hadamard_transform
|
|
36
32
|
|
|
33
|
+
logger = logging.get_logger(__name__)
|
|
34
|
+
|
|
37
35
|
|
|
38
36
|
def pad_to_block(tensor, dims, had_block_size, value=0):
|
|
39
37
|
pad_dims = [0 for _ in range(2 * len(tensor.shape))]
|
|
@@ -549,70 +547,47 @@ class HiggsLinear(torch.nn.Module):
         )
 
 
-def replace_with_higgs_linear(
-    model,
-    quantization_config=None,
-    current_key_name=None,
-    has_been_replaced=False,
-    modules_to_not_convert=None,
-):
+def replace_with_higgs_linear(model, modules_to_not_convert: list[str] | None = None, quantization_config=None):
     """
-    Public method that recursively replaces the Linear layers of the given model with HIGGS quantized layers.
-    `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
-    conversion has been successful or not.
+    Public method that replaces the Linear layers of the given model with HIGGS quantized layers.
 
     Args:
         model (`torch.nn.Module`):
             The model to convert, can be any `torch.nn.Module` instance.
+        modules_to_not_convert (`list[str]`, *optional*, defaults to `None`):
+            A list of nn.Linear weights to not convert. If a parameter path is in the list (e.g. `lm_head.weight`), the corresponding module will not be
+            converted.
         quantization_config (`HiggsConfig`):
             The quantization config object that contains the quantization parameters.
-        current_key_name (`list`, *optional*):
-            A list that contains the current key name. This is used for recursion and should not be passed by the user.
-        has_been_replaced (`bool`, *optional*):
-            A boolean that indicates if the conversion has been successful or not. This is used for recursion and
-            should not be passed by the user.
     """
 
-    for
-        if
-                    # Force requires grad to False to avoid unexpected errors
-                    model._modules[name].requires_grad_(False)
-        if len(list(module.children())) > 0:
-            _, has_been_replaced = replace_with_higgs_linear(
-                module,
-                quantization_config=quantization_config,
-                current_key_name=current_key_name,
-                has_been_replaced=has_been_replaced,
-                modules_to_not_convert=modules_to_not_convert,
-            )
-        # Remove the last key for recursion
-        current_key_name.pop(-1)
-    return model, has_been_replaced
+    has_been_replaced = False
+    # we need this to correctly materialize the weights during quantization
+    for module_name, module in model.named_modules():
+        if not should_convert_module(module_name, modules_to_not_convert):
+            continue
+        with torch.device("meta"):
+            if isinstance(module, nn.Linear):
+                new_module = HiggsLinear(
+                    module.in_features,
+                    module.out_features,
+                    bias=module.bias is not None,
+                    num_bits=quantization_config.bits,
+                    hadamard_size=quantization_config.hadamard_size,
+                    group_size=quantization_config.group_size,
+                )
+                new_module.source_cls = type(module)
+                new_module.requires_grad_(False)
+                model.set_submodule(module_name, new_module)
+                has_been_replaced = True
+
+    if not has_been_replaced:
+        logger.warning(
+            "You are loading your model using eetq but no linear modules were found in your model."
+            " Please double check your model architecture, or submit an issue on github if you think this is"
+            " a bug."
+        )
+    return model
 
 
 def dequantize_higgs(model, current_key_name=None):
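As context for the new flat (non-recursive) replacement above, the following self-contained sketch reproduces the same pattern with plain `nn.Linear` standing in for `HiggsLinear`. The helper name `swap_linears` and the skip-matching rule are illustrative only, not the transformers implementation.

```python
import torch
from torch import nn


def swap_linears(model: nn.Module, modules_to_not_convert: list[str] | None = None) -> nn.Module:
    """Illustrative flat replacement: walk named_modules() once and swap leaves in place."""
    skip = modules_to_not_convert or []
    for name, module in model.named_modules():
        if not isinstance(module, nn.Linear) or any(name == s or name.startswith(s + ".") for s in skip):
            continue
        # Build the replacement on the meta device; real weights are materialized later at load time.
        with torch.device("meta"):
            new_module = nn.Linear(module.in_features, module.out_features, bias=module.bias is not None)
        new_module.requires_grad_(False)
        model.set_submodule(name, new_module)
    return model


toy = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4))
toy = swap_linears(toy, modules_to_not_convert=["2"])
print(toy[0].weight.device, toy[2].weight.device)  # meta cpu
```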
@@ -11,11 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import importlib.metadata
 import os
 import re
 from collections.abc import Callable
 from types import ModuleType
 
+from packaging import version as pkg_version
+
 from ..utils import ENV_VARS_TRUE_VALUES, logging
 from ..utils.import_utils import is_kernels_available
 from .flash_attention import flash_attention_forward
@@ -28,10 +31,12 @@ try:
         Device,
         LayerRepository,
         Mode,
-        get_kernel,
         register_kernel_mapping,
         replace_kernel_forward_from_hub,
     )
+    from kernels import (
+        get_kernel as get_kernel_hub,
+    )
     from kernels import (
         use_kernel_forward_from_hub as _kernels_use_kernel_forward_from_hub,
     )
@@ -78,7 +83,7 @@ try:
             )
             return lambda func: func
 
-    _KERNEL_MAPPING: dict[str, dict[Device | str, LayerRepository]] = {
+    _KERNEL_MAPPING: dict[str, dict[Device | str, LayerRepository | dict[Mode, LayerRepository]]] = {
         "MultiScaleDeformableAttention": {
             "cuda": LayerRepository(
                 repo_id="kernels-community/deformable-detr",
@@ -111,6 +116,12 @@ try:
                 layer_name="RMSNorm",
             )
         },
+        "mps": {
+            Mode.INFERENCE: LayerRepository(
+                repo_id="kernels-community/mlx_rmsnorm",
+                layer_name="RMSNorm",
+            )
+        },
         "npu": {
             Mode.INFERENCE: LayerRepository(
                 repo_id="kernels-community/liger_kernels",
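For orientation, the per-device value in this mapping can now be either a single `LayerRepository` or a per-`Mode` dict, as in the new `"mps"` entry. The stand-in below sketches how such a structure is typically resolved; it deliberately avoids the real `kernels` API, and `resolve_kernel` plus the local `Mode` enum are illustrative only.

```python
from enum import Enum


class Mode(Enum):  # stand-in for kernels.Mode
    INFERENCE = "inference"
    TRAINING = "training"


def resolve_kernel(mapping: dict, device: str, mode: Mode):
    entry = mapping.get(device)
    if isinstance(entry, dict):  # per-mode entries, e.g. the new "mps" block above
        return entry.get(mode)
    return entry  # a single repository used for all modes, e.g. "cuda"


rms_norm_kernels = {
    "cuda": "kernels-community/liger_kernels:LigerRMSNorm",
    "mps": {Mode.INFERENCE: "kernels-community/mlx_rmsnorm:RMSNorm"},
}
print(resolve_kernel(rms_norm_kernels, "mps", Mode.INFERENCE))
```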
@@ -253,6 +264,8 @@ except ImportError:
 
 _HUB_KERNEL_MAPPING: dict[str, dict[str, str]] = {
     "causal-conv1d": {"repo_id": "kernels-community/causal-conv1d"},
+    "mamba-ssm": {"repo_id": "kernels-community/mamba-ssm", "revision": "v0.0.4"},
+    "falcon_mamba-ssm": {"repo_id": "kernels-community/mamba-ssm", "revision": "v0.0.4"},
 }
 
 _KERNEL_MODULE_MAPPING: dict[str, ModuleType | None] = {}
@@ -328,19 +341,21 @@ def lazy_load_kernel(kernel_name: str, mapping: dict[str, ModuleType | None] = _
     if kernel_name in mapping and isinstance(mapping[kernel_name], ModuleType):
         return mapping[kernel_name]
     if kernel_name not in _HUB_KERNEL_MAPPING:
-        logger.
+        logger.warning_once(f"Kernel {kernel_name} not found in _HUB_KERNEL_MAPPING")
         mapping[kernel_name] = None
         return None
     if _kernels_available:
-        from kernels import get_kernel
-
         try:
             repo_id = _HUB_KERNEL_MAPPING[kernel_name]["repo_id"]
+            revision = _HUB_KERNEL_MAPPING[kernel_name].get("revision", None)
             version = _HUB_KERNEL_MAPPING[kernel_name].get("version", None)
-            kernel = get_kernel(repo_id, version=version)
+            kernel = get_kernel(repo_id, revision=revision, version=version)
             mapping[kernel_name] = kernel
         except FileNotFoundError:
             mapping[kernel_name] = None
+        except AssertionError:
+            # Happens when torch is built without an accelerator backend; fall back to slow path.
+            mapping[kernel_name] = None
 
     else:
         # Try to import is_{kernel_name}_available from ..utils
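A brief, hedged usage sketch of the lazy loader above: with the `kernels` package installed, the pinned revision from `_HUB_KERNEL_MAPPING` is what gets fetched; without it, or when no accelerator backend is available, the function now returns `None` instead of raising.

```python
# Usage sketch; requires the `kernels` package and network access to the Hub.
from transformers.integrations.hub_kernels import lazy_load_kernel

mamba_ssm = lazy_load_kernel("mamba-ssm")  # resolves kernels-community/mamba-ssm at revision v0.0.4
if mamba_ssm is None:
    print("Kernel unavailable; the model will use the slower reference implementation.")
```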
@@ -358,7 +373,7 @@ def lazy_load_kernel(kernel_name: str, mapping: dict[str, ModuleType | None] = _
         if callable(is_kernel_available) and is_kernel_available():
             # Try to import the module "{kernel_name}" from parent package level
             try:
-                module = importlib.import_module(f"{
+                module = importlib.import_module(f"{new_kernel_name}")
                 mapping[kernel_name] = module
                 return module
             except Exception:
@@ -369,6 +384,46 @@ def lazy_load_kernel(kernel_name: str, mapping: dict[str, ModuleType | None] = _
     return mapping[kernel_name]
 
 
+def get_kernel(kernel_name: str, revision: str | None = None, version: str | None = None) -> ModuleType:
+    from .. import __version__
+
+    user_agent = {"framework": "transformers", "version": __version__, "repo_id": kernel_name}
+    if _kernels_available:
+        kernels_version = importlib.metadata.version("kernels")
+        if pkg_version.parse(kernels_version) >= pkg_version.parse("0.10.4"):
+            return get_kernel_hub(kernel_name, revision=revision, version=version, user_agent=user_agent)
+        else:
+            return get_kernel_hub(kernel_name, revision=revision)
+    else:
+        raise ImportError("kernels is not installed, please install it with `pip install kernels`")
+
+
+def use_kernelized_func(module_names: list[Callable] | Callable):
+    """
+    This decorator attaches the target function as an attribute of the module.
+    The function must already be decorated with @use_kernel_func_from_hub;
+    this decorator then wraps it as an nn.Module internally.
+    When kernelize is later applied to the full model, the function can be accessed as a regular module attribute and kernelized just like any other layer.
+    The kernelization is performed in place, modifying the module directly.
+    """
+    if isinstance(module_names, Callable):
+        module_names = [module_names]
+
+    def decorator(cls):
+        orig_init = cls.__init__
+
+        def new_init(self, *args, **kwargs):
+            orig_init(self, *args, **kwargs)
+            for fn in module_names:
+                # we hardcode the name of the function to "rotary_fn" for now
+                setattr(self, "rotary_fn", fn)
+
+        cls.__init__ = new_init
+        return cls
+
+    return decorator
+
+
 __all__ = [
     "LayerRepository",
     "use_kernel_forward_from_hub",
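The decorator added above only attaches the given function under the hardcoded `rotary_fn` attribute so that a later kernelize pass can swap it out. Here is a hedged, self-contained sketch of that wiring; `my_rotary_fn` and `ToyAttention` are illustrative names, not part of transformers.

```python
import torch
from torch import nn

from transformers.integrations.hub_kernels import use_kernelized_func


def my_rotary_fn(q, k, cos, sin):
    # Placeholder rotary application; a hub kernel could later replace it.
    return q * cos + k * sin, k


@use_kernelized_func(my_rotary_fn)
class ToyAttention(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def forward(self, q, k, cos, sin):
        q, k = self.rotary_fn(q, k, cos, sin)  # attribute attached by the decorator
        return self.proj(q)


attn = ToyAttention(8)
print(callable(attn.rotary_fn))  # True
```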
@@ -377,4 +432,6 @@ __all__ = [
     "register_kernel_mapping_transformers",
     "replace_kernel_forward_from_hub",
     "lazy_load_kernel",
-]
+    "get_kernel",
+    "use_kernelized_func",
+] # type: ignore
@@ -26,6 +26,7 @@ import re
 import shutil
 import sys
 import tempfile
+import warnings
 from dataclasses import fields
 from enum import Enum
 from pathlib import Path
@@ -939,6 +940,8 @@ class TrackioCallback(TrainerCallback):
     ```
     """
 
+    SPACE_URL = "https://huggingface.co/spaces/{space_id}"
+
     def __init__(self):
         has_trackio = is_trackio_available()
         if not has_trackio:
@@ -1057,6 +1060,39 @@ class TrackioCallback(TrainerCallback):
         metrics = rewrite_logs(metrics)
         self._trackio.log(metrics)
 
+    def on_push_begin(self, args, state, control, model, **kwargs):
+        if not state.is_world_process_zero or self._trackio is None:
+            return
+        if (current_project := self._trackio.context_vars.current_project.get()) is None:
+            return
+        trackio_version = packaging.version.parse(self._trackio.__version__)
+        if trackio_version < packaging.version.parse("0.13.0"):
+            warnings.warn(
+                "The version of `trackio` that is installed is <=0.13.0, so "
+                "the local Trackio project will not be pushed to Hugging Face. Run "
+                "`pip install --upgrade trackio` to fix this."
+            )
+            return
+
+        space_id = self._trackio.context_vars.current_space_id.get()
+        if space_id is None:
+            space_id = self._trackio.sync(current_project, force=True)
+        space_url = self.SPACE_URL.format(space_id=space_id)
+
+        badge_markdown = (
+            f'<a href="{space_url}" target="_blank"><img src="https://raw.githubusercontent.com/gradio-app/trackio/refs/heads/main/trackio/assets/badge.png" alt="Visualize in Trackio"'
+            ' title="Visualize in Trackio" style="height: 40px;"/></a>'
+        )
+        if badge_markdown not in modelcard.AUTOGENERATED_TRAINER_COMMENT:
+            modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n{badge_markdown}"
+
+        trackio_tags = ["trackio", f"trackio:{space_url}"]
+        if getattr(model, "model_tags", None) is not None:
+            if "trackio" not in model.model_tags:
+                model.model_tags.extend(trackio_tags)
+        else:
+            model.model_tags = trackio_tags
+
 
 class CometCallback(TrainerCallback):
     """
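To make the tagging behaviour above concrete, here is a small self-contained sketch of the same bookkeeping outside of the callback; `FakeModel` is a stand-in for a `PreTrainedModel` exposing a `model_tags` attribute, and `add_trackio_tags` is an illustrative helper, not transformers API.

```python
class FakeModel:
    model_tags = None  # stand-in for PreTrainedModel.model_tags


def add_trackio_tags(model, space_url: str) -> None:
    trackio_tags = ["trackio", f"trackio:{space_url}"]
    if getattr(model, "model_tags", None) is not None:
        if "trackio" not in model.model_tags:
            model.model_tags.extend(trackio_tags)  # only tag once per model
    else:
        model.model_tags = trackio_tags


m = FakeModel()
add_trackio_tags(m, "https://huggingface.co/spaces/user/my-trackio-space")
print(m.model_tags)
# ['trackio', 'trackio:https://huggingface.co/spaces/user/my-trackio-space']
```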
@@ -1455,6 +1491,10 @@ class NeptuneMissingConfiguration(Exception):
 class NeptuneCallback(TrainerCallback):
     """TrainerCallback that sends the logs to [Neptune](https://app.neptune.ai).
 
+    > [!WARNING]
+    > Neptune integration is deprecated and will be removed in a future version of Transformers. We recommend using
+    > other supported experiment tracking integrations.
+
     Args:
         api_token (`str`, *optional*): Neptune API token obtained upon registration.
             You can leave this argument out if you have saved your token to the `NEPTUNE_API_TOKEN` environment
@@ -1500,6 +1540,11 @@ class NeptuneCallback(TrainerCallback):
         log_checkpoints: str | None = None,
         **neptune_run_kwargs,
     ):
+        warnings.warn(
+            "The NeptuneCallback is deprecated and will be removed in a future version of Transformers. We recommend "
+            "using other supported experiment tracking integrations.",
+            FutureWarning,
+        )
         if not is_neptune_available():
             raise ValueError(
                 "NeptuneCallback requires the Neptune client library to be installed. "
@@ -77,6 +77,7 @@ def convert_tekken_tokenizer(tokenizer_file: str):
     """Convert a "tekken" tokenizer to a fast Tokenizer."""
     # Tekken format -- need to use the Converter
 
+    from mistral_common.tokens.tokenizers.base import SpecialTokens
     from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
 
     # Load directly using their lib
@@ -106,4 +107,15 @@ def convert_tekken_tokenizer(tokenizer_file: str):
     # Post-process
     tokenizer.add_special_tokens({"additional_special_tokens": all_special})
 
+    MAP_SPECAL = {
+        "bos_token": SpecialTokens.bos.value,
+        "eos_token": SpecialTokens.eos.value,
+        "pad_token": SpecialTokens.pad.value,
+        "unk_token": SpecialTokens.unk.value,
+    }
+
+    for special_key, special_token in MAP_SPECAL.items():
+        if special_token in all_special:
+            tokenizer.add_special_tokens({special_key: special_token})
+
     return tokenizer
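A hedged usage sketch of the converter with the newly mapped special tokens: it assumes `mistral_common` is installed, and the tekken.json path is a placeholder. Whichever of the canonical special tokens are present in the tekken vocabulary are now registered on the resulting fast tokenizer.

```python
from transformers.integrations.mistral import convert_tekken_tokenizer

tok = convert_tekken_tokenizer("path/to/tekken.json")  # placeholder path
# Special tokens found in the tekken vocabulary are registered under their canonical names.
print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token)
```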