transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
```diff
@@ -340,7 +340,7 @@ class Sam3VideoProcessor(ProcessorMixin):

         # slice those valid entries from the original outputs
         keep_idx = torch.nonzero(keep, as_tuple=True)[0]
-        keep_idx_gpu = keep_idx.
+        keep_idx_gpu = keep_idx.to(device=out_binary_masks.device, non_blocking=True)

         out_obj_ids = torch.index_select(out_obj_ids, 0, keep_idx)
         out_probs = torch.index_select(out_probs, 0, keep_idx)
```
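The rc2 change above keeps the mask-then-gather pattern but moves the index tensor to the masks' device with a non-blocking copy. A minimal sketch of that pattern with toy tensors (variable names mirror the hunk, but the values are illustrative, not the processor's real outputs):

```python
import torch

# toy stand-ins for the processor's per-object outputs (illustrative values)
out_obj_ids = torch.tensor([3, 7, 11, 19])
out_probs = torch.tensor([0.9, 0.2, 0.8, 0.1])
out_binary_masks = torch.zeros(4, 2, 2, dtype=torch.bool)

keep = out_probs > 0.5                                  # boolean mask over objects
keep_idx = torch.nonzero(keep, as_tuple=True)[0]        # tensor([0, 2])
# move the indices to wherever the masks live before gathering from them
keep_idx_gpu = keep_idx.to(device=out_binary_masks.device, non_blocking=True)

out_obj_ids = torch.index_select(out_obj_ids, 0, keep_idx)           # tensor([ 3, 11])
out_probs = torch.index_select(out_probs, 0, keep_idx)                # tensor([0.9000, 0.8000])
out_binary_masks = torch.index_select(out_binary_masks, 0, keep_idx_gpu)
```

Note that `non_blocking=True` only overlaps the copy with other work when the source tensor sits in pinned CPU memory; otherwise it behaves like a regular `.to()`.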
```diff
@@ -188,6 +188,7 @@ class SamHQVisionConfig(PreTrainedConfig):
         self.global_attn_indexes = global_attn_indexes
         self.num_pos_feats = num_pos_feats
         self.mlp_dim = int(hidden_size * mlp_ratio) if mlp_dim is None else mlp_dim
+        self.scale = self.hidden_size // 2


 class SamHQMaskDecoderConfig(PreTrainedConfig):
@@ -413,6 +413,29 @@ class SamHQVisionLayer(GradientCheckpointingLayer):
         return hidden_states


+class SamHQPositionalEmbedding(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.scale = config.scale
+        self.register_buffer("positional_embedding", self.scale * torch.randn((2, config.num_pos_feats)))
+
+    def forward(self, input_coords, input_shape=None):
+        """Positionally encode points that are normalized to [0,1]."""
+        coordinates = input_coords.clone()
+
+        if input_shape is not None:
+            coordinates[:, :, :, 0] = coordinates[:, :, :, 0] / input_shape[1]
+            coordinates[:, :, :, 1] = coordinates[:, :, :, 1] / input_shape[0]
+
+        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
+        coordinates = 2 * coordinates - 1
+        coordinates = coordinates.to(self.positional_embedding.dtype)
+        coordinates = coordinates @ self.positional_embedding
+        coordinates = 2 * np.pi * coordinates
+        # outputs d_1 x ... x d_n x channel shape
+        return torch.cat([torch.sin(coordinates), torch.cos(coordinates)], dim=-1)
+
+
 @auto_docstring
 class SamHQPreTrainedModel(PreTrainedModel):
     config: SamHQConfig
@@ -433,6 +456,8 @@ class SamHQPreTrainedModel(PreTrainedModel):
         elif isinstance(module, SamHQVisionEncoder):
             if self.config.use_abs_pos:
                 init.zeros_(module.pos_embed)
+        elif isinstance(module, SamHQPositionalEmbedding):
+            init.normal_(module.positional_embedding, std=module.scale)


 class SamHQPatchEmbeddings(nn.Module):
@@ -525,6 +550,7 @@ class SamHQVisionEncoder(SamHQPreTrainedModel):
         self.neck = SamHQVisionNeck(config)

         self.gradient_checkpointing = False
+        self.post_init()

     def get_input_embeddings(self):
         return self.patch_embed
@@ -1069,29 +1095,6 @@ class SamHQVisionModel(SamHQPreTrainedModel):
         return self.vision_encoder(pixel_values, **kwargs)


-class SamHQPositionalEmbedding(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.scale = config.hidden_size // 2
-        self.register_buffer("positional_embedding", self.scale * torch.randn((2, config.num_pos_feats)))
-
-    def forward(self, input_coords, input_shape=None):
-        """Positionally encode points that are normalized to [0,1]."""
-        coordinates = input_coords.clone()
-
-        if input_shape is not None:
-            coordinates[:, :, :, 0] = coordinates[:, :, :, 0] / input_shape[1]
-            coordinates[:, :, :, 1] = coordinates[:, :, :, 1] / input_shape[0]
-
-        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
-        coordinates = 2 * coordinates - 1
-        coordinates = coordinates.to(self.positional_embedding.dtype)
-        coordinates = coordinates @ self.positional_embedding
-        coordinates = 2 * np.pi * coordinates
-        # outputs d_1 x ... x d_n x channel shape
-        return torch.cat([torch.sin(coordinates), torch.cos(coordinates)], dim=-1)
-
-
 class SamHQMaskEmbedding(nn.Module):
     def __init__(self, config: SamHQPromptEncoderConfig):
         super().__init__()
```
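The SamHQ hunks move `SamHQPositionalEmbedding` ahead of `SamHQPreTrainedModel`, read its scale from the new config attribute (`hidden_size // 2`), and re-seed its random projection buffer in `_init_weights` rather than only at construction. A standalone sketch of the same random Fourier-feature encoding the module implements (`ToyPositionalEmbedding` and its defaults are illustrative, not part of transformers):

```python
import numpy as np
import torch
from torch import nn


class ToyPositionalEmbedding(nn.Module):
    """Random Fourier features for 2D points in [0, 1], mirroring the SamHQ module above."""

    def __init__(self, num_pos_feats: int = 128, scale: float = 64.0):
        super().__init__()
        # (2, num_pos_feats) Gaussian projection; SamHQ now (re)initializes this in _init_weights
        self.register_buffer("positional_embedding", scale * torch.randn((2, num_pos_feats)))

    def forward(self, coords: torch.Tensor) -> torch.Tensor:
        coords = 2 * coords - 1                        # map [0, 1] -> [-1, 1]
        coords = coords.to(self.positional_embedding.dtype)
        coords = coords @ self.positional_embedding    # project (x, y) to num_pos_feats dims
        coords = 2 * np.pi * coords
        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)


points = torch.rand(1, 4, 2)                           # batch of 4 normalized (x, y) points
print(ToyPositionalEmbedding()(points).shape)          # torch.Size([1, 4, 256])
```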
@@ -287,18 +287,17 @@ class SeamlessM4TConformerRelPositionalEmbedding(nn.Module):
|
|
|
287
287
|
super().__init__()
|
|
288
288
|
self.max_len = config.max_source_positions
|
|
289
289
|
self.d_model = config.hidden_size
|
|
290
|
-
self.pe =
|
|
291
|
-
self.extend_pe(torch.tensor(0.0).expand(1, self.max_len))
|
|
290
|
+
self.register_buffer("pe", self.extend_pe(torch.tensor(0.0).expand(1, self.max_len)), persistent=False)
|
|
292
291
|
|
|
293
|
-
def extend_pe(self, x):
|
|
292
|
+
def extend_pe(self, x, pe=None):
|
|
294
293
|
# Reset the positional encodings
|
|
295
|
-
if
|
|
294
|
+
if pe is not None:
|
|
296
295
|
# self.pe contains both positive and negative parts
|
|
297
296
|
# the length of self.pe is 2 * input_len - 1
|
|
298
|
-
if
|
|
299
|
-
if
|
|
300
|
-
|
|
301
|
-
return
|
|
297
|
+
if pe.size(1) >= x.size(1) * 2 - 1:
|
|
298
|
+
if pe.dtype != x.dtype or pe.device != x.device:
|
|
299
|
+
pe = pe.to(dtype=x.dtype, device=x.device)
|
|
300
|
+
return pe
|
|
302
301
|
# Suppose `i` is the position of query vector and `j` is the
|
|
303
302
|
# position of key vector. We use positive relative positions when keys
|
|
304
303
|
# are to the left (i>j) and negative relative positions otherwise (i<j).
|
|
@@ -319,10 +318,10 @@ class SeamlessM4TConformerRelPositionalEmbedding(nn.Module):
|
|
|
319
318
|
pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
|
|
320
319
|
pe_negative = pe_negative[1:].unsqueeze(0)
|
|
321
320
|
pe = torch.cat([pe_positive, pe_negative], dim=1)
|
|
322
|
-
|
|
321
|
+
return pe.to(device=x.device, dtype=x.dtype)
|
|
323
322
|
|
|
324
323
|
def forward(self, hidden_states: torch.Tensor):
|
|
325
|
-
self.extend_pe(hidden_states)
|
|
324
|
+
self.pe = self.extend_pe(hidden_states, self.pe)
|
|
326
325
|
start_idx = self.pe.size(1) // 2 - hidden_states.size(1) + 1
|
|
327
326
|
end_idx = self.pe.size(1) // 2 + hidden_states.size(1)
|
|
328
327
|
relative_position_embeddings = self.pe[:, start_idx:end_idx]
|
|
@@ -884,13 +883,14 @@ class SeamlessM4TScaledWordEmbedding(nn.Embedding):
         return super().forward(input_ids) * self.embed_scale
 
 
-# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding
+# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding with M2M100->SeamlessM4T
 class SeamlessM4TSinusoidalPositionalEmbedding(nn.Module):
     """This module produces sinusoidal positional embeddings of any length."""
 
     def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
         super().__init__()
         self.offset = 2
+        self.num_positions = num_positions
         self.embedding_dim = embedding_dim
         self.padding_idx = padding_idx
         self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
@@ -1375,11 +1375,27 @@ class SeamlessM4TPreTrainedModel(PreTrainedModel):
         elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, nn.Conv1d):
             init.kaiming_normal_(module.weight)
             if module.bias is not None:
                 k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                 init.uniform_(module.bias, a=-k, b=k)
+        elif isinstance(module, SeamlessM4TSinusoidalPositionalEmbedding):
+            emb_weights = module.get_embedding(
+                module.num_positions + module.offset, module.embedding_dim, module.padding_idx
+            )
+            init.copy_(module.weights, emb_weights)
+        elif isinstance(module, SeamlessM4TConformerRotaryPositionalEmbedding):
+            dim = self.config.hidden_size // self.config.speech_encoder_attention_heads
+            base = self.config.rotary_embedding_base
+            inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
+            init.copy_(module.inv_freq, inv_freq)
+        elif isinstance(module, SeamlessM4TConformerRelPositionalEmbedding):
+            init.copy_(module.pe, module.extend_pe(torch.tensor(0.0).expand(1, module.max_len)))
 
     def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask):
         kernel_size, stride = self.config.adaptor_kernel_size, self.config.adaptor_stride
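The new `_init_weights` branches recompute the deterministic positional tables (sinusoidal weights, the rotary `inv_freq`, and the Conformer `pe` buffer) instead of leaving them randomly initialized. For reference, a sketch of the standard fairseq-style sinusoidal table that a `get_embedding` helper of this kind produces; this is the textbook formula, not the library method itself:

```python
import math
from typing import Optional

import torch


def sinusoidal_table(num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None) -> torch.Tensor:
    """Standard fairseq-style sinusoidal table: sin in the first half of the channels, cos in the second."""
    half_dim = embedding_dim // 2
    step = math.log(10000) / (half_dim - 1)
    freqs = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -step)
    angles = torch.arange(num_positions, dtype=torch.float32).unsqueeze(1) * freqs.unsqueeze(0)
    table = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
    if embedding_dim % 2 == 1:  # zero-pad odd embedding sizes
        table = torch.cat([table, torch.zeros(num_positions, 1)], dim=1)
    if padding_idx is not None:
        table[padding_idx, :] = 0  # the padding position gets an all-zero vector
    return table
```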
@@ -1770,6 +1786,7 @@ class SeamlessM4TDecoder(SeamlessM4TPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1914,6 +1931,7 @@ class SeamlessM4TTextToUnitModel(SeamlessM4TPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -2035,6 +2053,7 @@ class SeamlessM4TTextToUnitForConditionalGeneration(SeamlessM4TPreTrainedModel,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -2354,7 +2373,7 @@ class SeamlessM4TCodeHifiGan(PreTrainedModel):
         return input_lengths
 
     def forward(
-        self, input_ids: torch.LongTensor, spkr_id: torch.Tensor, lang_id: torch.Tensor
+        self, input_ids: torch.LongTensor, spkr_id: torch.Tensor, lang_id: torch.Tensor, **kwargs
     ) -> tuple[torch.Tensor]:
         """
         Args:
@@ -2996,6 +3015,7 @@ class SeamlessM4TForTextToSpeech(SeamlessM4TPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -60,7 +60,7 @@ class SeamlessM4TTokenizer(TokenizersBackend):
     Args:
         vocab (`list` or `dict`, *optional*):
            List of (token, score) tuples or dict mapping tokens to indices. If not provided, uses default vocab.
-        merges (`list`, *optional*):
+        merges (`str` or `list`, *optional*):
            List of merge rules for BPE model. If not provided, uses empty list.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
@@ -104,15 +104,15 @@ class SeamlessM4TTokenizer(TokenizersBackend):
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
-
+    model = BPE
 
-    prefix_tokens: list[int] = []
-    suffix_tokens: list[int] = []
+    prefix_tokens: list[int] = None
+    suffix_tokens: list[int] = None
 
     def __init__(
         self,
-        vocab: Optional[
-        merges: Optional[list] = None,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
+        merges: Optional[Union[str, list[str]]] = None,
         bos_token="<s>",
         eos_token="</s>",
         sep_token="</s>",
@@ -126,59 +126,14 @@ class SeamlessM4TTokenizer(TokenizersBackend):
         vocab_file=None,
         **kwargs,
     ):
-
-
-
-
-
-
-        }
-
-        # Process vocab - SeamlessM4T uses fairseq vocab alignment: <pad>=0, <unk>=1, <s>=2, </s>=3, then SPM pieces[3:]
-        if isinstance(vocab, list):
-            # Convert list of (token, score) tuples to dict {token: idx}
-            # Check if vocab is already in SeamlessM4T order (pad, unk, s, /s) or tokenizer.json order (unk, s, /s, ...)
-            first_tokens = [str(item[0]) if isinstance(item, (list, tuple)) else str(item) for item in vocab[:4]]
-            is_seamless_order = (
-                len(first_tokens) >= 4
-                and first_tokens[0] == str(pad_token)
-                and first_tokens[1] == str(unk_token)
-                and first_tokens[2] == str(bos_token)
-                and first_tokens[3] == str(eos_token)
-            )
-
-            if is_seamless_order:
-                # Already in correct order, use list index directly as token ID
-                vocab_dict = {}
-                for idx, item in enumerate(vocab):
-                    token = str(item[0]) if isinstance(item, (list, tuple)) else str(item)
-                    vocab_dict[token] = idx
-                self._vocab = vocab_dict
-            else:
-                # Reorder to fairseq: <pad>, <unk>, <s>, </s>, ... (rest of vocab)
-                vocab_dict = {}
-                vocab_dict[str(pad_token)] = 0
-                vocab_dict[str(unk_token)] = 1
-                vocab_dict[str(bos_token)] = 2
-                vocab_dict[str(eos_token)] = 3
-
-                # Add rest of vocab starting from index 4, skipping tokens we already added
-                idx = 4
-                for item in vocab:
-                    token = str(item[0]) if isinstance(item, (list, tuple)) else str(item)
-                    if token not in vocab_dict:
-                        vocab_dict[token] = idx
-                        idx += 1
-
-                self._vocab = vocab_dict
-        else:
-            self._vocab = vocab
-
-        if merges is None:
-            self._merges = []
-        else:
-            self._merges = [tuple(merge) if isinstance(merge, list) else merge for merge in merges]
+        self._vocab = vocab or {
+            str(pad_token): 0,
+            str(unk_token): 1,
+            str(bos_token): 2,
+            str(eos_token): 3,
+        }
 
+        self._merges = merges or []
         self._tokenizer = Tokenizer(
             BPE(
                 vocab=self._vocab,
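After this change the constructor falls back to the four fairseq specials (`<pad>`=0, `<unk>`=1, `<s>`=2, `</s>`=3) and feeds `self._vocab` and `self._merges` straight into a `tokenizers` BPE model. A standalone sketch of that underlying construction with a toy vocab; the real constructor passes additional options, so only the shape of the inputs is shown here:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Toy vocab in the fairseq order the tokenizer defaults to: <pad>=0, <unk>=1, <s>=2, </s>=3.
vocab = {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3, "a": 4, "b": 5, "ab": 6}
merges = [("a", "b")]  # BPE merge rules as (left, right) pairs

tokenizer = Tokenizer(BPE(vocab=vocab, merges=merges, unk_token="<unk>"))
print(tokenizer.encode("ab").ids)  # the merge rule fuses "a" + "b" into id 6
```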
@@ -216,7 +171,6 @@ class SeamlessM4TTokenizer(TokenizersBackend):
         kwargs.setdefault("additional_special_tokens", additional_special_tokens)
 
         super().__init__(
-            tokenizer_object=self._tokenizer,
             bos_token=bos_token,
             eos_token=eos_token,
             sep_token=sep_token,
@@ -245,6 +199,20 @@ class SeamlessM4TTokenizer(TokenizersBackend):
 
         self.set_tgt_lang_special_tokens(self._tgt_lang)
 
+    @classmethod
+    def convert_from_spm_model(cls, vocab, **kwargs):
+        """When converting from spm, offset is needed to account for special tokens."""
+        _vocab = {
+            "<pad>": 0,
+            "<unk>": 1,
+            "<s>": 2,
+            "</s>": 3,
+        }
+        for i, token in enumerate(list(vocab.keys())):
+            _vocab[token] = i + 1  # offset by 1 to account for special tokens
+        kwargs["vocab"] = _vocab
+        return kwargs
+
     @property
     def src_lang(self) -> str:
         return self._src_lang
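A toy illustration of the mapping `convert_from_spm_model` produces: SentencePiece vocabularies start at `<unk>`=0, `<s>`=1, `</s>`=2, so shifting every piece by one frees id 0 for `<pad>` while keeping the other specials aligned. The token names below are made up for the example:

```python
# Same offset logic as convert_from_spm_model, applied to a toy SentencePiece vocab.
spm_vocab = {"<unk>": 0, "<s>": 1, "</s>": 2, "▁hello": 3, "▁world": 4}

converted = {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3}
for i, token in enumerate(spm_vocab):
    converted[token] = i + 1  # every SPM piece moves up by one id

assert converted == {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3, "▁hello": 4, "▁world": 5}
```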
@@ -762,6 +762,7 @@ class SeamlessM4Tv2SinusoidalPositionalEmbedding(nn.Module):
     def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
         super().__init__()
         self.offset = 2
+        self.num_positions = num_positions
         self.embedding_dim = embedding_dim
         self.padding_idx = padding_idx
         self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
@@ -1292,6 +1293,11 @@ class SeamlessM4Tv2PreTrainedModel(PreTrainedModel):
             if module.bias is not None:
                 k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                 init.uniform_(module.bias, a=-k, b=k)
+        elif isinstance(module, SeamlessM4Tv2SinusoidalPositionalEmbedding):
+            emb_weights = module.get_embedding(
+                module.num_positions + module.offset, module.embedding_dim, module.padding_idx
+            )
+            init.copy_(module.weights, emb_weights)
 
     # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TPreTrainedModel._compute_sub_sample_lengths_from_attention_mask
     def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask):
@@ -1812,6 +1818,7 @@ class SeamlessM4Tv2Decoder(SeamlessM4Tv2PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1995,6 +2002,7 @@ class SeamlessM4Tv2TextToUnitDecoder(SeamlessM4Tv2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SeamlessM4Tv2TextToUnitDecoderOutput]:
         r"""
         Args:
@@ -2122,6 +2130,7 @@ class SeamlessM4Tv2TextToUnitModel(SeamlessM4Tv2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -2556,7 +2565,7 @@ class SeamlessM4Tv2CodeHifiGan(PreTrainedModel):
 
     # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.forward with SeamlessM4T->SeamlessM4Tv2, spkr_id->speaker_id
     def forward(
-        self, input_ids: torch.LongTensor, speaker_id: torch.Tensor, lang_id: torch.Tensor
+        self, input_ids: torch.LongTensor, speaker_id: torch.Tensor, lang_id: torch.Tensor, **kwargs
     ) -> tuple[torch.Tensor]:
         """
         Args:
@@ -3214,6 +3223,7 @@ class SeamlessM4Tv2ForTextToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMixin
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -40,7 +40,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_seed_oss import SeedOssConfig
 
 
@@ -311,7 +311,7 @@ class SeedOssRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -350,7 +350,7 @@ class SeedOssRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
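The only change in this hunk is swapping `torch.autocast` for the `maybe_autocast` helper imported above; reading it as a wrapper that degrades gracefully on devices where autocast is unavailable is an assumption, not something the diff states. The numerical intent is unchanged: compute the rotary angles in float32 even under mixed precision. A standalone sketch of that pattern using plain `torch.autocast` (function name and defaults are illustrative):

```python
import torch


def rope_cos_sin(inv_freq: torch.Tensor, position_ids: torch.Tensor, attention_scaling: float = 1.0):
    """Compute RoPE cos/sin in float32, regardless of any enclosing autocast context."""
    # (dim/2,) -> (batch, dim/2, 1); (batch, seq) -> (batch, 1, seq)
    inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
    position_ids_expanded = position_ids[:, None, :].float()

    device_type = position_ids.device.type if position_ids.device.type != "mps" else "cpu"
    with torch.autocast(device_type=device_type, enabled=False):  # force float32 math
        freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)  # (batch, seq, dim/2)
        emb = torch.cat((freqs, freqs), dim=-1)                              # (batch, seq, dim)
        cos = emb.cos() * attention_scaling
        sin = emb.sin() * attention_scaling
    return cos, sin


cos, sin = rope_cos_sin(1.0 / 10000 ** (torch.arange(0, 64, 2) / 64), torch.arange(16)[None, :])
```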
@@ -168,7 +168,6 @@ class SegformerImageProcessorFast(BaseImageProcessorFast):
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
 
         # Stack images into a single tensor if return_tensors is set
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
 
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
 
@@ -434,6 +434,7 @@ class SegformerModel(SegformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -486,6 +487,7 @@ class SegformerForImageClassification(SegformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SegFormerImageClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -547,9 +549,9 @@ class SegformerMLP(nn.Module):
         return hidden_states
 
 
-class SegformerDecodeHead(SegformerPreTrainedModel):
+class SegformerDecodeHead(nn.Module):
     def __init__(self, config):
-        super().__init__(config)
+        super().__init__()
         # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size
         mlps = []
         for i in range(config.num_encoder_blocks):
@@ -572,7 +574,7 @@ class SegformerDecodeHead(SegformerPreTrainedModel):
 
         self.config = config
 
-    def forward(self, encoder_hidden_states: torch.FloatTensor) -> torch.Tensor:
+    def forward(self, encoder_hidden_states: torch.FloatTensor, **kwargs) -> torch.Tensor:
         batch_size = encoder_hidden_states[-1].shape[0]
 
         all_hidden_states = ()
@@ -627,6 +629,7 @@ class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SemanticSegmenterOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
@@ -140,7 +140,6 @@ class SegformerImageProcessorFast(BeitImageProcessorFast):
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
 
         # Stack images into a single tensor if return_tensors is set
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
 
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
 
@@ -647,6 +647,7 @@ class SegGptModel(SegGptPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SegGptEncoderOutput]:
         r"""
         prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
@@ -843,6 +844,7 @@ class SegGptForImageSegmentation(SegGptPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SegGptImageSegmentationOutput]:
         r"""
         prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
@@ -773,6 +773,7 @@ class SEWModel(SEWPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -902,6 +903,7 @@ class SEWForCTC(SEWPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
@@ -1013,6 +1015,7 @@ class SEWForSequenceClassification(SEWPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -392,6 +392,7 @@ class SEWModel(SEWPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1318,6 +1318,7 @@ class SEWDModel(SEWDPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1445,6 +1446,7 @@ class SEWDForCTC(SEWDPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
@@ -1557,6 +1559,7 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -57,6 +57,7 @@ class ShieldGemma2ForImageClassification(PreTrainedModel):
         self.yes_token_index = getattr(config, "yes_token_index", 10_784)
         self.no_token_index = getattr(config, "no_token_index", 3771)
         self.model = AutoModelForImageTextToText.from_config(config=config)
+        self.post_init()
 
     def get_input_embeddings(self):
         return self.model.language_model.get_input_embeddings()
@@ -430,6 +430,8 @@ class SiglipPreTrainedModel(PreTrainedModel):
                 else self.config.hidden_size
             )
             init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
+            if hasattr(module, "position_ids"):
+                init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, nn.Embedding):
             default_flax_embed_init(module.weight)
         elif isinstance(module, SiglipAttention):
@@ -465,6 +467,8 @@ class SiglipPreTrainedModel(PreTrainedModel):
         elif isinstance(module, nn.LayerNorm):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+        elif isinstance(module, SiglipTextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
 
 
 # Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->Siglip
@@ -502,9 +506,11 @@ class SiglipEncoder(nn.Module):
         return BaseModelOutput(last_hidden_state=hidden_states)
 
 
-class SiglipTextTransformer(nn.Module):
+class SiglipTextTransformer(SiglipPreTrainedModel):
+    _input_embed_layer = "token_embedding"
+
     def __init__(self, config: SiglipTextConfig):
-        super().__init__()
+        super().__init__(config)
         self.config = config
         embed_dim = config.hidden_size
         self.embeddings = SiglipTextEmbeddings(config)
@@ -512,6 +518,7 @@ class SiglipTextTransformer(nn.Module):
         self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
 
         self.head = nn.Linear(embed_dim, config.projection_size)
+        self.post_init()
 
     @can_return_tuple
     @auto_docstring
@@ -614,6 +621,7 @@ class SiglipTextModel(SiglipPreTrainedModel):
 
 
 class SiglipVisionTransformer(SiglipPreTrainedModel):
+    _input_embed_layer = "patch_embedding"
     _can_record_outputs = {
         "hidden_states": SiglipEncoderLayer,
         "attentions": SiglipAttention,
@@ -631,6 +639,8 @@ class SiglipVisionTransformer(SiglipPreTrainedModel):
         if self.use_head:
             self.head = SiglipMultiheadAttentionPoolingHead(config)
 
+        self.post_init()
+
     @check_model_inputs(tie_last_hidden_states=False)
     @auto_docstring
     def forward(
@@ -774,6 +784,12 @@ class SiglipModel(SiglipPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
+    def get_input_embeddings(self) -> nn.Module:
+        return self.text_model.embeddings.token_embedding
+
+    def set_input_embeddings(self, value: nn.Module):
+        self.text_model.embeddings.token_embedding = value
+
     @filter_out_non_signature_kwargs()
     @auto_docstring
     def get_text_features(
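With the accessors added above, the multimodal SiglipModel exposes its text token table the way text-only models do, so generic utilities that call `get_input_embeddings()` keep working. A minimal sketch using a randomly initialized model built from the default config (illustrative only, no pretrained weights involved):

```python
from transformers import SiglipConfig, SiglipModel

model = SiglipModel(SiglipConfig())   # random weights, default sub-configs
emb = model.get_input_embeddings()    # the text token embedding (nn.Embedding)
print(type(emb).__name__, tuple(emb.weight.shape))
model.set_input_embeddings(emb)       # round-trip through the new setter
```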
@@ -969,6 +985,12 @@ class SiglipForImageClassification(SiglipPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
+    def get_input_embeddings(self) -> nn.Module:
+        return self.vision_model.embeddings.patch_embedding
+
+    def set_input_embeddings(self, value: nn.Module):
+        self.vision_model.embeddings.patch_embedding = value
+
     @check_model_inputs
     @auto_docstring
     def forward(