transformers 5.0.0rc0-py3-none-any.whl → 5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
|
2
|
+
# This file was automatically generated from src/transformers/models/glmasr/modular_glmasr.py.
|
|
3
|
+
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
|
4
|
+
# the file from the modular. If any change should be done, please apply the change to the
|
|
5
|
+
# modular_glmasr.py file directly. One of our CI enforces this.
|
|
6
|
+
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
|
7
|
+
# coding=utf-8
|
|
8
|
+
# Copyright 2025 the HuggingFace Team. All rights reserved.
|
|
9
|
+
#
|
|
10
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
11
|
+
# you may not use this file except in compliance with the License.
|
|
12
|
+
# You may obtain a copy of the License at
|
|
13
|
+
#
|
|
14
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
15
|
+
#
|
|
16
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
17
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
18
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
19
|
+
# See the License for the specific language governing permissions and
|
|
20
|
+
# limitations under the License.
|
|
21
|
+
|
|
22
|
+
import re
|
|
23
|
+
from typing import Optional, Union
|
|
24
|
+
|
|
25
|
+
import numpy as np
|
|
26
|
+
|
|
27
|
+
from ...audio_utils import AudioInput, make_list_of_audio
|
|
28
|
+
from ...feature_extraction_utils import BatchFeature
|
|
29
|
+
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
|
30
|
+
from ...tokenization_utils_base import TextInput
|
|
31
|
+
from ...utils import is_torch_available, logging
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
if is_torch_available():
|
|
35
|
+
import torch
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
logger = logging.get_logger(__name__)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class GlmAsrProcessorKwargs(ProcessingKwargs, total=False):
|
|
42
|
+
_defaults = {
|
|
43
|
+
"text_kwargs": {
|
|
44
|
+
"padding": True,
|
|
45
|
+
},
|
|
46
|
+
"audio_kwargs": {
|
|
47
|
+
"sampling_rate": 16000,
|
|
48
|
+
"chunk_length": 30.0,
|
|
49
|
+
"return_attention_mask": True,
|
|
50
|
+
"padding": "max_length",
|
|
51
|
+
},
|
|
52
|
+
"common_kwargs": {
|
|
53
|
+
"return_tensors": "pt",
|
|
54
|
+
"padding_side": "left",
|
|
55
|
+
},
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class GlmAsrProcessor(ProcessorMixin):
|
|
60
|
+
r"""
|
|
61
|
+
Constructs an GlmAsr processor which wraps an GlmAsr feature extractor and an GlmAsr
|
|
62
|
+
tokenizer into a single processor.
|
|
63
|
+
|
|
64
|
+
[`GlmAsrProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
|
|
65
|
+
[`Qwen2TokenizerFast`]. See the [`~GlmAsrProcessor.__call__`] for more information.
|
|
66
|
+
|
|
67
|
+
Args:
|
|
68
|
+
feature_extractor ([`WhisperFeatureExtractor`]):
|
|
69
|
+
The feature extractor is a required input.
|
|
70
|
+
tokenizer ([`Qwen2TokenizerFast`]):
|
|
71
|
+
The tokenizer is a required input.
|
|
72
|
+
chat_template (`Optional[str]`, *optional*):
|
|
73
|
+
The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
|
|
74
|
+
template will be used.
|
|
75
|
+
audio_token (`Optional[str]`, *optional*, defaults to `"<|pad|>`"):
|
|
76
|
+
Special token used to represent audio inputs in the chat template.
|
|
77
|
+
default_transcription_prompt (`str`, *optional*, defaults to `"Please transcribe this audio into text"`):
|
|
78
|
+
Default prompt to use for transcription tasks when applying transcription requests.
|
|
79
|
+
max_audio_len (`int`, *optional*, defaults to 655):
|
|
80
|
+
Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
|
|
81
|
+
655 gives approximately 8192 tokens, corresponding to the maximum sequence length of the text model.
|
|
82
|
+
"""
|
|
83
|
+
|
|
84
|
+
def __init__(
|
|
85
|
+
self,
|
|
86
|
+
feature_extractor,
|
|
87
|
+
tokenizer,
|
|
88
|
+
chat_template=None,
|
|
89
|
+
audio_token="<|pad|>",
|
|
90
|
+
default_transcription_prompt="Please transcribe this audio into text",
|
|
91
|
+
max_audio_len=655,
|
|
92
|
+
):
|
|
93
|
+
self.audio_token = audio_token
|
|
94
|
+
self.audio_token_id = tokenizer.convert_tokens_to_ids(audio_token)
|
|
95
|
+
self.default_transcription_prompt = default_transcription_prompt
|
|
96
|
+
self.max_audio_len = max_audio_len
|
|
97
|
+
super().__init__(feature_extractor, tokenizer, chat_template=chat_template)
|
|
98
|
+
|
|
99
|
+
def _get_audio_token_length(self, audio_lengths: "torch.Tensor") -> "torch.Tensor":
|
|
100
|
+
merge_factor = 4
|
|
101
|
+
for padding, kernel_size, stride in [(1, 3, 1), (1, 3, 2)]:
|
|
102
|
+
audio_lengths = (audio_lengths + 2 * padding - (kernel_size - 1) - 1) // stride + 1
|
|
103
|
+
|
|
104
|
+
num_tokens = (audio_lengths - merge_factor) // merge_factor + 1
|
|
105
|
+
return num_tokens
|
|
106
|
+
|
|
107
|
+
+    def __call__(
+        self,
+        text: Union[TextInput, list[TextInput]],
+        audio: Optional[AudioInput] = None,
+        output_labels: Optional[bool] = False,
+        **kwargs: Unpack[GlmAsrProcessorKwargs],
+    ) -> BatchFeature:
+        r"""
+        Main method to prepare one or several text sequence(s) and audio waveform(s) for the model. This
+        method expands audio placeholder tokens (`self.audio_token`) in the text based on the post-pool
+        frame counts of the audio windows, tokenizes the resulting strings, and extracts log-mel features
+        with [`WhisperFeatureExtractor`]. If `audio` is `None`, no audio processing is performed and
+        the text is tokenized as-is (LM-only behavior).
+
+        Args:
+            text (`str` or `list[str]`):
+                Input sequence or batch of sequences.
+            audio (`np.ndarray` or `list[np.ndarray]`):
+                Input audio or batch of audios as NumPy arrays. If provided, there must be as many `text` inputs as
+                `audio` inputs.
+            output_labels (`bool`, *optional*, defaults to `False`):
+                Whether to return labels for training.
+
+        Returns:
+            [`BatchFeature`]: A dictionary with tokenized text (`input_ids`, `attention_mask`) and
+            audio features (`input_features`, `input_features_mask`).
+        """
+
+        # Merge defaults with user kwargs
+        call_kwargs = self._merge_kwargs(
+            GlmAsrProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        text_kwargs = call_kwargs["text_kwargs"]
+        audio_kwargs = call_kwargs["audio_kwargs"]
+        return_tensors = text_kwargs.get("return_tensors")
+        if return_tensors != "pt":
+            raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")
+
+        if isinstance(text, str):
+            text = [text]
+        elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
+            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+        audio_inputs = {}
+        if audio is not None:
+            audio = make_list_of_audio(audio)
+            if len(text) != len(audio):
+                raise ValueError(f"Got {len(text)} text but {len(audio)} audios; they must match 1:1.")
+
+            # Determine number of chunks per sample, and flatten
+            window_size = int(audio_kwargs["sampling_rate"] * audio_kwargs["chunk_length"])
+            max_windows = int(self.max_audio_len // audio_kwargs["chunk_length"])
+
+            per_sample_windows: list[int] = []
+            flat_chunks: list[np.ndarray] = []
+
+            for audio_el in audio:
+                n_samples = int(audio_el.shape[0])
+                n_win = max(1, (n_samples + window_size - 1) // window_size)
+                if n_win > max_windows:
+                    logger.warning(
+                        f"Audio duration ({n_samples / audio_kwargs['sampling_rate']:.1f}s) exceeds {self.max_audio_len}s; truncating to first {self.max_audio_len}s."
+                    )
+                    n_win = max_windows
+                per_sample_windows.append(n_win)
+
+                time_cap = min(n_samples, n_win * window_size)
+                for i in range(n_win):
+                    start = i * window_size
+                    end = min((i + 1) * window_size, time_cap)
+                    flat_chunks.append(audio_el[start:end])
+
+            # Feature extraction
+            audio_inputs = self.feature_extractor(flat_chunks, **audio_kwargs)
+            padding_mask = audio_inputs.pop("attention_mask")
+            audio_inputs["input_features_mask"] = padding_mask
+
+            # Compute per-sample audio lengths for token counting
+            audio_lengths = torch.stack([s.sum() for s in torch.split(padding_mask.sum(-1), per_sample_windows)])
+            audio_tokens_lengths = self._get_audio_token_length(audio_lengths)
+
+            # Expand audio tokens in the text
+            for i, audio_length in enumerate(audio_tokens_lengths):
+                expanded = re.sub(re.escape(self.audio_token), self.audio_token * audio_length, text[i])
+                text[i] = expanded
+
+        # Tokenize
+        text_inputs = self.tokenizer(text, **text_kwargs)
+
+        data = {**text_inputs, **audio_inputs}
+        if output_labels:
+            labels = data["input_ids"].clone()
+            labels[labels == self.audio_token_id] = -100
+            labels[labels == self.tokenizer.pad_token_id] = -100
+            data["labels"] = labels
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
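A usage sketch for `__call__` (the checkpoint id below is a placeholder, not a confirmed repo name; any GLM-ASR checkpoint exposing this processor would work the same way):

    import numpy as np
    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("zai-org/GLM-ASR")  # placeholder id
    speech = np.zeros(16000 * 5, dtype=np.float32)  # 5 s of silence at 16 kHz

    batch = processor(
        text=f"User: {processor.audio_token}Please transcribe this audio into text\nAssistant:",
        audio=speech,
        return_tensors="pt",  # the only supported tensor type, per the check above
    )
    print(batch["input_ids"].shape, batch["input_features"].shape)

The single `{processor.audio_token}` placeholder in the text is expanded to one token per post-pool audio frame before tokenization.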
+    @property
+    def model_input_names(self) -> list[str]:
+        tok_names = self.tokenizer.model_input_names
+        fea_names = self.feature_extractor.model_input_names
+        return list(dict.fromkeys(tok_names + fea_names + ["input_features_mask"]))
+
+    def apply_transcription_request(
+        self,
+        audio: Union[str, list[str], AudioInput],
+        prompt: Optional[Union[str, list[str]]] = None,
+        **kwargs: Unpack[GlmAsrProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Prepare inputs for automatic speech recognition without manually writing the default transcription prompt.
+
+        Args:
+            audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                Audio to transcribe. Strings are interpreted as local paths or URLs and will be loaded automatically by
+                the chat template loader; NumPy arrays and PyTorch tensors are forwarded directly.
+            prompt (`str` or `list[str]`, *optional*):
+                Custom prompt(s) to include in the user turn. A list must be the same length as the batch. When `None`,
+                each sample uses the processor's `default_transcription_prompt`.
+            **kwargs:
+                Additional keyword arguments forwarded to [`~GlmAsrProcessor.apply_chat_template`] (for example
+                `text_kwargs`, `audio_kwargs`, ...).
+
+        Returns:
+            [`BatchFeature`]: Processor outputs ready to be passed to the model's `generate` method.
+        """
+
+        if isinstance(audio, str):
+            audio_items: list[Union[str, np.ndarray]] = [audio]
+        elif isinstance(audio, (list, tuple)) and audio and all(isinstance(el, str) for el in audio):
+            audio_items = list(audio)
+        else:
+            audio_items = list(make_list_of_audio(audio))
+            if is_torch_available():
+                audio_items = [el.detach().cpu().numpy() if isinstance(el, torch.Tensor) else el for el in audio_items]
+
+        batch_size = len(audio_items)
+        if batch_size == 0:
+            raise ValueError("`audio` must contain at least one sample.")
+
+        if prompt is None:
+            prompts = [self.default_transcription_prompt] * batch_size
+        elif isinstance(prompt, str):
+            prompts = [prompt] * batch_size
+        elif isinstance(prompt, (list, tuple)):
+            if len(prompt) != batch_size:
+                raise ValueError(
+                    f"Received {len(prompt)} prompt(s) for {batch_size} audio sample(s); counts must match."
+                )
+            prompts = []
+            for item in prompt:
+                if item is None:
+                    prompts.append(self.default_transcription_prompt)
+                elif isinstance(item, str):
+                    prompts.append(item)
+                else:
+                    raise TypeError("Each prompt must be a string or `None`.")
+        else:
+            raise TypeError("`prompt` must be a string, a sequence of strings, or `None`.")
+
+        conversations = [
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "audio", "path": audio_item}
+                        if isinstance(audio_item, str)
+                        else {"type": "audio", "audio": audio_item},
+                        {"type": "text", "text": prompt_text},
+                    ],
+                }
+            ]
+            for prompt_text, audio_item in zip(prompts, audio_items)
+        ]
+
+        return self.apply_chat_template(
+            conversations,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_dict=True,
+            **kwargs,
+        )
+
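The convenience wrapper in use, continuing the placeholder processor from the earlier sketch (the file path is illustrative):

    # Build a ready-to-generate transcription request from a local file path.
    inputs = processor.apply_transcription_request("sample.wav")

    # Equivalent to a one-turn chat with the default prompt; a custom prompt also works:
    inputs = processor.apply_transcription_request("sample.wav", prompt="Transcribe and keep punctuation.")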
+    def batch_decode(self, *args, strip_prefix=False, **kwargs):
+        """
+        Forward arguments to [`~PreTrainedTokenizer.batch_decode`] and optionally remove the assistant framing the
+        model was trained to produce.
+
+        Transcription requests are answered with sentences such as `"The spoken content of the audio is \"...\"."`.
+        Setting `strip_prefix=True` trims this fixed framing, leaving just the transcription text.
+        """
+        decoded = self.tokenizer.batch_decode(*args, **kwargs)
+        if strip_prefix:
+            decoded = [self._strip_assistant_prefix_and_quotes(text) for text in decoded]
+        return decoded
+
+    def _strip_assistant_prefix_and_quotes(self, text: str) -> str:
+        """
+        Remove the assistant prefix and surrounding quotes from a decoded transcription string.
+        """
+
+        stripped = text.strip()
+
+        for prefix in (
+            "The spoken content of the audio is",
+            "The transcription of the audio is",
+        ):
+            if stripped.startswith(prefix):
+                stripped = stripped[len(prefix) :].strip()
+                break
+
+        if stripped.endswith("."):
+            stripped = stripped[:-1].strip()
+
+        if len(stripped) >= 2 and stripped[0] == stripped[-1] and stripped[0] in {"'", '"'}:
+            stripped = stripped[1:-1].strip()
+
+        return stripped
+
+
+__all__ = ["GlmAsrProcessor"]
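The decode-side helper in isolation (same placeholder processor; the framed sentence follows the prefix formats listed in `_strip_assistant_prefix_and_quotes`):

    framed = 'The spoken content of the audio is "Hello world".'
    # prefix removed, trailing period removed, surrounding quotes removed
    print(processor._strip_assistant_prefix_and_quotes(framed))  # -> Hello world

    # or end to end:
    # processor.batch_decode(generated_ids, skip_special_tokens=True, strip_prefix=True)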
@@ -107,7 +107,6 @@ class GLPNImageProcessorFast(BaseImageProcessorFast):
             processed_groups[shape] = stacked_images
 
         processed_images = reorder_images(processed_groups, grouped_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
 
     def post_process_depth_estimation(self, outputs, target_sizes=None):
@@ -411,6 +411,7 @@ class GLPNModel(GLPNPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -597,6 +598,7 @@ class GLPNForDepthEstimation(GLPNPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], DepthEstimatorOutput]:
         r"""
         labels (`torch.FloatTensor` of shape `(batch_size, height, width)`, *optional*):
@@ -189,7 +189,6 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
 
         return BatchFeature(
             data={"pixel_values": processed_images, "num_patches": num_patches}, tensor_type=return_tensors
@@ -433,6 +433,7 @@ class GotOcr2VisionEncoder(GotOcr2PreTrainedModel):
         self.neck = GotOcr2VisionNeck(config)
 
         self.gradient_checkpointing = False
+        self.post_init()
 
     def get_input_embeddings(self):
         return self.patch_embed
@@ -796,6 +797,7 @@ class GotOcr2ForConditionalGeneration(GotOcr2PreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -807,12 +809,15 @@ class GotOcr2ForConditionalGeneration(GotOcr2PreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsequent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
 
         return model_inputs
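The `is_first_iteration` flag generalizes the old `cache_position[0] == 0` prefill check: pixel values are needed on any step where image features have not yet been merged into the KV cache. A minimal greedy loop illustrating the pattern (illustrative sketch only, not the transformers generation API):

    import torch

    def generate_greedy(model, input_ids, pixel_values, max_new_tokens=8):
        # With a KV cache, image features enter the sequence once, so every
        # step after the first only needs the newly generated token ids.
        cache = None
        for step in range(max_new_tokens):
            out = model(
                input_ids=input_ids if cache is None else input_ids[:, -1:],
                pixel_values=pixel_values if step == 0 else None,  # first iteration only
                past_key_values=cache,
                use_cache=True,
            )
            cache = out.past_key_values
            next_token = out.logits[:, -1].argmax(dim=-1, keepdim=True)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
        return input_ids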
@@ -45,6 +45,7 @@ from ...utils import (
     auto_docstring,
     logging,
 )
+from ...utils.generic import maybe_autocast
 from .configuration_gpt2 import GPT2Config
 
 
@@ -102,7 +103,6 @@ class GPT2Attention(nn.Module):
             ),
             persistent=False,
         )
-        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
 
         self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
@@ -150,7 +150,7 @@ class GPT2Attention(nn.Module):
             scale_factor /= float(self.layer_idx + 1)
 
         # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk))
-        with torch.autocast(query.device.type, enabled=False):
+        with maybe_autocast(query.device.type, enabled=False):
             q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
             attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
             attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
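`maybe_autocast` lets the fp32 upcast work on device types where a bare `torch.autocast` context would fail. A sketch of the idea; this mirrors, but is not, the actual helper in `transformers.utils.generic`:

    import torch
    from contextlib import nullcontext

    def maybe_autocast(device_type: str, **kwargs):
        # Fall back to a no-op context when the device type has no autocast support
        # (assumption: the real helper's support check may differ).
        if device_type in ("cuda", "cpu"):
            return torch.autocast(device_type, **kwargs)
        return nullcontext()

    q = torch.randn(2, 4, 8, dtype=torch.float16)
    k = torch.randn(2, 8, 4, dtype=torch.float16)
    with maybe_autocast(q.device.type, enabled=False):
        # upcast to fp32 so the attention matmul keeps full precision
        attn = torch.baddbmm(torch.zeros(2, 4, 4), q.float(), k.float(), beta=0, alpha=1.0)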
@@ -475,12 +475,8 @@ class GPT2PreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_attention_backend = True
-
     _can_compile_fullgraph = True
 
-    def __init__(self, *inputs, **kwargs):
-        super().__init__(*inputs, **kwargs)
-
     @torch.no_grad()
     def _init_weights(self, module):
         """Initialize the weights."""
@@ -496,6 +492,14 @@ class GPT2PreTrainedModel(PreTrainedModel):
         elif isinstance(module, nn.LayerNorm):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+        elif isinstance(module, GPT2Attention):
+            max_positions = module.config.max_position_embeddings
+            init.copy_(
+                module.bias,
+                torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
+                    1, 1, max_positions, max_positions
+                ),
+            )
 
         # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
         # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
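The new `GPT2Attention` branch re-materializes the causal-mask buffer during weight initialization instead of relying on a stored value. What the buffer contains, shown for `max_positions=4`:

    import torch

    # Lower-triangular boolean mask: position i may attend to positions <= i.
    max_positions = 4
    mask = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool))
    mask = mask.view(1, 1, max_positions, max_positions)  # broadcast over (batch, heads)
    print(mask[0, 0].int())
    # tensor([[1, 0, 0, 0],
    #         [1, 1, 0, 0],
    #         [1, 1, 1, 0],
    #         [1, 1, 1, 1]])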
@@ -1021,6 +1025,7 @@ class GPT2ForSequenceClassification(GPT2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutputWithPast]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
@@ -1148,6 +1153,7 @@ class GPT2ForTokenClassification(GPT2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
@@ -1228,6 +1234,7 @@ class GPT2ForQuestionAnswering(GPT2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
@@ -14,12 +14,12 @@
 # limitations under the License.
 """Tokenization classes for OpenAI GPT."""
 
-from typing import Optional
+from typing import Optional, Union
 
 from tokenizers import Tokenizer, decoders, pre_tokenizers
 from tokenizers.models import BPE
 
-from ...tokenization_utils_tokenizers import TokenizersBackend
+from ...tokenization_utils_tokenizers import AddedToken, TokenizersBackend
 from ...utils import logging
 
 
@@ -84,45 +84,31 @@ class GPT2Tokenizer(TokenizersBackend):
         add_bos_token (`bool`, *optional*, defaults to `False`):
             Whether or not to add an initial beginning of sentence token to the input. This allows to treat the leading
             word just as any other word.
-        vocab (`dict`, *optional*):
-            Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file
-        merges (`list`, *optional*):
-            Custom merges list. If not provided, merges are loaded from merges_file
+        vocab (`str` or `dict[str, int]`, *optional*):
+            Custom vocabulary dictionary. If not provided, vocabulary is loaded from `vocab_file`.
+        merges (`str` or `list[str]`, *optional*):
+            Custom merges list. If not provided, merges are loaded from `merges_file`.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
-
+    model = BPE
 
     def __init__(
         self,
-        errors="replace",
-        unk_token="<|endoftext|>",
-        bos_token="<|endoftext|>",
-        eos_token="<|endoftext|>",
-        pad_token=None,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
+        merges: Optional[Union[str, list[str]]] = None,
+        errors: str = "replace",
+        unk_token: Union[AddedToken, str] = "<|endoftext|>",
+        bos_token: Union[AddedToken, str] = "<|endoftext|>",
+        eos_token: Union[AddedToken, str] = "<|endoftext|>",
+        pad_token: Optional[Union[AddedToken, str]] = None,
         add_prefix_space=False,
-        add_bos_token=False,
-        vocab: Optional[dict] = None,
-        merges: Optional[list] = None,
         **kwargs,
     ):
-        # self.add_bos_token = add_bos_token
-
         self.add_prefix_space = add_prefix_space
-
-        if vocab is not None:
-            self._vocab = (
-                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
-            )
-        else:
-            self._vocab = {}
-
-        if merges is not None:
-            self._merges = [tuple(merge) if isinstance(merge, list) else merge for merge in merges]
-        else:
-            self._merges = []
-
+        self._vocab = vocab if vocab is not None else {}
+        self._merges = merges or []
        self._tokenizer = Tokenizer(
             BPE(
                 vocab=self._vocab,
@@ -133,31 +119,17 @@ class GPT2Tokenizer(TokenizersBackend):
                 fuse_unk=False,
             )
         )
-
         self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
         self._tokenizer.decoder = decoders.ByteLevel()
-
-        tokenizer_object = self._tokenizer
-
-        # Set these before calling super().__init__() so the base class _post_init() can use them
-        self._add_bos_token = add_bos_token
-        self._add_eos_token = False
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             errors=errors,
             unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
             pad_token=pad_token,
             add_prefix_space=add_prefix_space,
-            add_bos_token=add_bos_token,
             **kwargs,
         )
 
-        # Call _post_init for tokenizers created directly (not from_pretrained)
-        # For from_pretrained, this will be called again after loading the tokenizer from file
-        self._post_init()
-
 
 __all__ = ["GPT2Tokenizer"]
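The refactored constructor wires an in-memory vocab/merges pair straight into a byte-level BPE `tokenizers.Tokenizer`. The same setup built directly against the `tokenizers` library, with a toy vocabulary (illustration only; real GPT-2 checkpoints ship a 50k-entry vocab):

    from tokenizers import Tokenizer, decoders, pre_tokenizers
    from tokenizers.models import BPE

    vocab = {"<|endoftext|>": 0, "l": 1, "o": 2, "w": 3, "lo": 4, "low": 5}
    merges = [("l", "o"), ("lo", "w")]
    tok = Tokenizer(BPE(vocab=vocab, merges=merges, unk_token=None, fuse_unk=False))
    tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tok.decoder = decoders.ByteLevel()
    print(tok.encode("low").tokens)  # ['low'] after the l+o and lo+w merges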
@@ -26,7 +26,6 @@ from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
 from ...masking_utils import create_causal_mask
-from ...modeling_flash_attention_utils import is_flash_attn_available
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
@@ -43,10 +42,6 @@ from ...utils import (
 from .configuration_gpt_bigcode import GPTBigCodeConfig
 
 
-if is_flash_attn_available():
-    pass
-
-
 logger = logging.get_logger(__name__)
 
 
@@ -360,9 +355,6 @@ class GPTBigCodePreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
 
-    def __init__(self, *inputs, **kwargs):
-        super().__init__(*inputs, **kwargs)
-
     @torch.no_grad()
     def _init_weights(self, module):
         """Initialize the weights."""
@@ -377,6 +369,9 @@ class GPTBigCodePreTrainedModel(PreTrainedModel):
             init.normal_(
                 module.c_proj.weight, mean=0.0, std=self.config.initializer_range / math.sqrt(2 * self.config.n_layer)
             )
+        elif isinstance(module, GPTBigCodeModel):
+            max_positions = module.config.max_position_embeddings
+            init.copy_(module.bias, torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)))
 
 
 @auto_docstring
@@ -826,6 +821,7 @@ class GPTBigCodeForTokenClassification(GPTBigCodePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         input_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`):