transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/oneformer/modeling_oneformer.py

@@ -39,6 +39,7 @@ from ...utils import (
     requires_backends,
 )
 from ...utils.backbone_utils import load_backbone
+from ...utils.generic import maybe_autocast
 from .configuration_oneformer import OneFormerConfig
 
 
@@ -322,7 +323,7 @@ class OneFormerHungarianMatcher(nn.Module):
                 align_corners=False,
             ).squeeze(1)
 
-            with
+            with maybe_autocast(device_type="cuda", enabled=False):
                 pred_mask = pred_mask.float()
                 target_mask = target_mask.float()
 
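The hunk above swaps a hard-coded autocast context for the new `maybe_autocast` helper imported from `transformers.utils.generic`. The sketch below is a minimal illustration of the same pattern, not library code: it forces a numerically sensitive cost computation to run in float32 even under mixed precision. The helper is called exactly as in the diff; the function name, shapes, and use of `torch.cdist` are assumptions made up for the example.

```python
import torch

from transformers.utils.generic import maybe_autocast


def l1_mask_cost(pred_mask: torch.Tensor, target_mask: torch.Tensor) -> torch.Tensor:
    # Disable autocast for this block so the cost matrix is computed in full fp32,
    # mirroring the OneFormerHungarianMatcher change above (assumes the helper can be
    # used standalone with the same arguments as in the diff).
    with maybe_autocast(device_type="cuda", enabled=False):
        pred_mask = pred_mask.float()
        target_mask = target_mask.float()
        return torch.cdist(pred_mask.flatten(1), target_mask.flatten(1), p=1)
```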
@@ -934,44 +935,6 @@ class OneFormerForUniversalSegmentationOutput(ModelOutput):
     attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
 
 
-# Modified from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrFrozenBatchNorm2d with DeformableDetr->OneFormerPixelDecoder
-class OneFormerPixelDecoderFrozenBatchNorm2d(nn.Module):
-    """
-    BatchNorm2d where the batch statistics and the affine parameters are fixed.
-
-    Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than
-    torchvision.models.resnet[18,34,50,101] produce nans.
-    """
-
-    def __init__(self, n):
-        super().__init__()
-        self.register_buffer("weight", torch.ones(n))
-        self.register_buffer("bias", torch.zeros(n))
-        self.register_buffer("running_mean", torch.zeros(n))
-        self.register_buffer("running_var", torch.ones(n))
-
-    def _load_from_state_dict(
-        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
-    ):
-        num_batches_tracked_key = prefix + "num_batches_tracked"
-        if num_batches_tracked_key in state_dict:
-            del state_dict[num_batches_tracked_key]
-
-        super()._load_from_state_dict(
-            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
-        )
-
-    def forward(self, x):
-        weight = self.weight.reshape(1, -1, 1, 1)
-        bias = self.bias.reshape(1, -1, 1, 1)
-        running_var = self.running_var.reshape(1, -1, 1, 1)
-        running_mean = self.running_mean.reshape(1, -1, 1, 1)
-        epsilon = 1e-5
-        scale = weight * (running_var + epsilon).rsqrt()
-        bias = bias - running_mean * scale
-        return x * scale + bias
-
-
 # Modified from transformers.models.detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->OneFormerPixelDecoderEncoder
 class OneFormerPixelDecoderEncoderMultiscaleDeformableAttention(nn.Module):
     """
@@ -2832,6 +2795,10 @@ class OneFormerPreTrainedModel(PreTrainedModel):
             init.normal_(module.weight, mean=0.0, std=std)
             if module.bias is not None:
                 init.zeros_(module.bias)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
             init.ones_(module.weight)
             init.zeros_(module.bias)
@@ -2842,6 +2809,9 @@ class OneFormerPreTrainedModel(PreTrainedModel):
                 init.zeros_(module.weight[module.padding_idx])
         elif isinstance(module, OneFormerLoss):
             init.constant_(module.logit_scale, np.log(1 / self.config.contrastive_temperature))
+            empty_weight = torch.ones(module.num_classes + 1)
+            empty_weight[-1] = module.eos_coef
+            init.copy_(module.empty_weight, empty_weight)
 
 
 @auto_docstring
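Both `_init_weights` hunks above extend initialization from parameters to buffers: BatchNorm running statistics in the first, the loss's `empty_weight` class-weight buffer in the second, using the new `transformers.initialization` helpers (`init.zeros_`, `init.ones_`, `init.copy_`). The self-contained sketch below shows the same pattern written against plain `torch` only, with hypothetical module names; it illustrates what those calls write, not the library's implementation.

```python
import torch
from torch import nn


class TinySegmentationLoss(nn.Module):
    # Hypothetical stand-in for a loss module that keeps a class-weight buffer.
    def __init__(self, num_classes: int = 3, eos_coef: float = 0.1):
        super().__init__()
        self.num_classes = num_classes
        self.eos_coef = eos_coef
        self.register_buffer("empty_weight", torch.empty(num_classes + 1))


@torch.no_grad()
def init_buffers(module: nn.Module) -> None:
    # Reset BatchNorm-style statistics buffers when a module carries them,
    # as the first hunk does.
    if getattr(module, "running_mean", None) is not None:
        module.running_mean.zero_()
        module.running_var.fill_(1.0)
        module.num_batches_tracked.zero_()
    # Fill the class-weight buffer: 1.0 per class, eos_coef for the "no object" slot,
    # as the second hunk does for OneFormerLoss.empty_weight.
    if isinstance(module, TinySegmentationLoss):
        empty_weight = torch.ones(module.num_classes + 1)
        empty_weight[-1] = module.eos_coef
        module.empty_weight.copy_(empty_weight)


model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), TinySegmentationLoss())
model.apply(init_buffers)
```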
@@ -2872,6 +2842,7 @@ class OneFormerModel(OneFormerPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> OneFormerModelOutput:
         r"""
         task_inputs (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -3058,6 +3029,7 @@ class OneFormerForUniversalSegmentation(OneFormerPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> OneFormerForUniversalSegmentationOutput:
         r"""
         task_inputs (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
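Several forward signatures in this release (the two OneFormer heads above, and the OpenAI GPT classes further down) gain a trailing `**kwargs`. The toy below is unrelated to any real model and only illustrates the observable effect: an unexpected keyword argument no longer raises a `TypeError`; whether any given kwarg is consumed elsewhere is not visible from this diff.

```python
from typing import Optional

import torch
from torch import nn


class Toy(nn.Module):
    # Before such a change, forward(self, x, return_dict=None) would raise a TypeError
    # when called with an unknown keyword argument; with **kwargs it is simply accepted.
    def forward(self, x: torch.Tensor, return_dict: Optional[bool] = None, **kwargs) -> torch.Tensor:
        return x * 2


print(Toy()(torch.ones(2), return_dict=True, some_future_flag=True))  # tensor([2., 2.])
```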
transformers/models/openai/modeling_openai.py

@@ -24,6 +24,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
+from ... import initialization as init
 from ...activations import gelu_new, get_activation, silu
 from ...generation import GenerationMixin
 from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
@@ -46,6 +47,7 @@ ACT_FNS = {"relu": nn.ReLU(), "silu": silu, "gelu": gelu_new, "swish": silu}
 class Attention(nn.Module):
     def __init__(self, nx, n_positions, config, scale=False):
         super().__init__()
+        self.n_positions = n_positions
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
         if n_state % config.n_head != 0:
             raise ValueError(f"Attention n_state shape: {n_state} must be divisible by config.n_head {config.n_head}")
@@ -259,6 +261,16 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
     config: OpenAIGPTConfig
     base_model_prefix = "transformer"
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Attention):
+            n_positions = module.n_positions
+            init.copy_(
+                module.bias, torch.tril(torch.ones(n_positions, n_positions)).view(1, 1, n_positions, n_positions)
+            )
+        elif isinstance(module, OpenAIGPTModel):
+            init.copy_(module.position_ids, torch.arange(module.config.n_positions))
+
 
 @dataclass
 @auto_docstring(
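The new `_init_weights` above rebuilds two constant buffers: the attention's causal mask, now derivable from the `n_positions` value the earlier hunk stores on the module, and the model's `position_ids`. The snippet below only evaluates the two expressions from the diff for a toy `n_positions`, to make explicit what `init.copy_` writes into `Attention.bias` and `OpenAIGPTModel.position_ids`.

```python
import torch

n_positions = 4  # toy value; the real model uses config.n_positions
causal_mask = torch.tril(torch.ones(n_positions, n_positions)).view(1, 1, n_positions, n_positions)
position_ids = torch.arange(n_positions)

print(causal_mask[0, 0])
# tensor([[1., 0., 0., 0.],
#         [1., 1., 0., 0.],
#         [1., 1., 1., 0.],
#         [1., 1., 1., 1.]])
print(position_ids)  # tensor([0, 1, 2, 3])
```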
@@ -317,6 +329,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -514,6 +527,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
|
|
514
527
|
output_attentions: Optional[bool] = None,
|
|
515
528
|
output_hidden_states: Optional[bool] = None,
|
|
516
529
|
return_dict: Optional[bool] = None,
|
|
530
|
+
**kwargs,
|
|
517
531
|
) -> Union[tuple[torch.Tensor], OpenAIGPTDoubleHeadsModelOutput]:
|
|
518
532
|
r"""
|
|
519
533
|
mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):
|
|
@@ -624,6 +638,7 @@ class OpenAIGPTForSequenceClassification(OpenAIGPTPreTrainedModel):
|
|
|
624
638
|
output_attentions: Optional[bool] = None,
|
|
625
639
|
output_hidden_states: Optional[bool] = None,
|
|
626
640
|
return_dict: Optional[bool] = None,
|
|
641
|
+
**kwargs,
|
|
627
642
|
) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
|
|
628
643
|
r"""
|
|
629
644
|
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
|
@@ -14,10 +14,11 @@
 # limitations under the License.
 """Tokenization classes for OpenAI GPT."""
 
+from typing import Optional, Union
+
 from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
 from tokenizers.models import BPE
 
-from ...convert_slow_tokenizer import generate_merges
 from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
 

@@ -48,40 +49,26 @@ class OpenAIGPTTokenizer(TokenizersBackend):
         unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        vocab (`dict`, *optional*):
+        vocab (`str` or `dict[str, int]`, *optional*):
             Custom vocabulary dictionary. If not provided, a blank vocabulary is initialized.
-        merges (`list`, *optional*):
+        merges (`str` or `list[str]`, *optional*):
             Custom merges list. If not provided, an empty list is used.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    model = BPE
 
     def __init__(
         self,
-
-
-
-        vocab_file=None,
-        merges_file=None,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
+        merges: Optional[Union[str, list[str]]] = None,
+        unk_token: str = "<unk>",
         **kwargs,
     ):
-
-
-        self._vocab = (
-            {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
-        )
-        else:
-            # Initialize minimal vocabulary with unk token
-            self._vocab = {str(unk_token): 0}
-
-        # Initialize merges
-        if merges is not None:
-            self._merges = merges if merges is not None else generate_merges(self._vocab)
-        else:
-            self._merges = []
+        self._vocab = vocab if vocab is not None else {str(unk_token): 0}
+        self._merges = merges or []
 
-        # Create BPE tokenizer
         self._tokenizer = Tokenizer(
             BPE(
                 vocab=self._vocab,

@@ -107,34 +94,11 @@ class OpenAIGPTTokenizer(TokenizersBackend):
         self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
         self._tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")
 
-        tokenizer_object = self._tokenizer
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             unk_token=unk_token,
             **kwargs,
         )
 
-        self.vocab_file = vocab_file
-        self.merges_file = merges_file
-
-    def _post_init(self):
-        """Post-initialization to ensure tokenizer settings are applied correctly."""
-        # Re-apply settings to ensure they're correct after loading from pretrained
-        self._tokenizer.normalizer = normalizers.Sequence(
-            [
-                normalizers.NFD(),
-                normalizers.Lowercase(),
-                normalizers.StripAccents(),
-            ]
-        )
-
-        self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
-        self._tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")
-
-        # Call parent to handle AddedToken properties
-        super()._post_init()
-
     @property
     def do_lower_case(self):
         return True
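The GPT tokenizer refactor above swaps the legacy `vocab_file`/`merges_file` constructor arguments and the `_post_init` re-application step for direct `vocab`/`merges` arguments on the `TokenizersBackend` subclass. A minimal usage sketch based only on the defaulting logic visible in the diff (real use would load a published checkpoint rather than relying on the blank vocabulary):

# Assumes the refactored OpenAIGPTTokenizer shown above is importable.
tokenizer = OpenAIGPTTokenizer()   # no files: starts from {"<unk>": 0} and an empty merges list
print(tokenizer._vocab)            # {'<unk>': 0}
print(tokenizer.do_lower_case)     # True (NFD + Lowercase + StripAccents normalizer)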
@@ -836,6 +836,7 @@ class OPTForSequenceClassification(OPTPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         position_ids: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):

@@ -947,6 +948,7 @@ class OPTForQuestionAnswering(OPTPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         position_ids: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         r"""
         Example:
@@ -213,7 +213,6 @@ class Ovis2ImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(data={"pixel_values": processed_images, "grids": grids}, tensor_type=return_tensors)
 
 
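The fast image processors (Ovis2 here, OWLv2/OWL-ViT further down) drop the explicit `torch.stack(...)` before building the `BatchFeature`. The change only makes sense if the tensor conversion triggered by `tensor_type=return_tensors` now batches the list of per-image tensors itself (note that `feature_extraction_utils.py` is also touched in this release). For reference, what the removed line did explicitly:

import torch

processed_images = [torch.zeros(3, 2, 2), torch.ones(3, 2, 2)]  # two toy processed images
stacked = torch.stack(processed_images, dim=0)
print(stacked.shape)  # torch.Size([2, 3, 2, 2]) -- this batching step now presumably happens inside BatchFeature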
@@ -27,6 +27,7 @@ from typing import Optional, Union
 import torch
 from torch import nn
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache
 from ...generation import GenerationMixin

@@ -430,6 +431,11 @@ class Ovis2PreTrainedModel(PreTrainedModel):
     _can_compile_fullgraph = True
     _supports_attention_backend = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Ovis2VisionEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+
 
 def hard_softmax(logits: torch.Tensor, dim: int):
     y_soft = logits.softmax(dim)

@@ -457,6 +463,8 @@ class Ovis2VisionModel(Ovis2PreTrainedModel):
         )
         self.head_norm = nn.LayerNorm(self.vocab_size - self.num_visual_indicator_tokens)
 
+        self.post_init()
+
     def forward(self, pixel_values: torch.FloatTensor, **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
         outputs = self.transformer(pixel_values, **kwargs)
         last_hidden_state = outputs[0]

@@ -780,6 +788,7 @@ class Ovis2ForConditionalGeneration(Ovis2PreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -791,12 +800,15 @@ class Ovis2ForConditionalGeneration(Ovis2PreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if
-        #
-        #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
 
         return model_inputs
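Ovis2's `prepare_inputs_for_generation` now receives an explicit `is_first_iteration` flag (added to the signature a hunk earlier) instead of deriving the condition locally; the removed lines were already truncated in this view, so only the new form is legible. The gating idea is simple: pixel values are forwarded on the first call, or whenever there is no KV cache to hold the merged image features. A stripped-down sketch of the same logic, independent of the actual model classes:

def prepare_inputs(pixel_values, is_first_iteration=False, **kwargs):
    model_inputs = {}
    # Images go to the model only on the first iteration, or when caching is disabled.
    if is_first_iteration or not kwargs.get("use_cache", True):
        model_inputs["pixel_values"] = pixel_values
    return model_inputs

print(prepare_inputs("IMG", is_first_iteration=True))   # {'pixel_values': 'IMG'}
print(prepare_inputs("IMG"))                             # {}
print(prepare_inputs("IMG", use_cache=False))            # {'pixel_values': 'IMG'}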
@@ -19,6 +19,7 @@ from typing import Optional, Union
 import torch
 from torch import nn
 
+from ... import initialization as init
 from ...cache_utils import Cache
 from ...generation import GenerationMixin
 from ...modeling_outputs import BaseModelOutput

@@ -159,6 +160,11 @@ class Ovis2PreTrainedModel(PreTrainedModel):
     _can_compile_fullgraph = True
     _supports_attention_backend = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Ovis2VisionEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+
 
 class Ovis2VisionModel(Ovis2PreTrainedModel):
     config: Ovis2VisionConfig

@@ -176,6 +182,8 @@ class Ovis2VisionModel(Ovis2PreTrainedModel):
         )
         self.head_norm = nn.LayerNorm(self.vocab_size - self.num_visual_indicator_tokens)
 
+        self.post_init()
+
     def forward(self, pixel_values: torch.FloatTensor, **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
         outputs = self.transformer(pixel_values, **kwargs)
         last_hidden_state = outputs[0]
@@ -336,8 +336,6 @@ class Owlv2ImageProcessorFast(BaseImageProcessorFast):
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
 
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
-
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
 
 

@@ -15,7 +15,6 @@
 """PyTorch OWLv2 model."""
 
 from dataclasses import dataclass
-from functools import lru_cache
 from typing import Any, Optional, Union
 
 import torch

@@ -575,10 +574,12 @@ class Owlv2PreTrainedModel(PreTrainedModel):
         if isinstance(module, Owlv2TextEmbeddings):
             init.normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02)
             init.normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, Owlv2VisionEmbeddings):
             init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
             init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
             init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, Owlv2Attention):
             in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
             out_proj_std = (module.embed_dim**-0.5) * factor
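Both OWLv2 embeddings classes (and their OWL-ViT counterparts below) now have their `position_ids` buffer rewritten in `_init_weights` to the canonical 0..seq_len-1 row. The expression used above yields a `(1, seq_len)` index tensor, which is easy to verify in isolation:

import torch

seq_len = 6  # stands in for module.position_ids.shape[-1]
position_ids = torch.arange(seq_len).expand((1, -1))
print(position_ids.shape)  # torch.Size([1, 6])
print(position_ids)        # tensor([[0, 1, 2, 3, 4, 5]])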
@@ -601,6 +602,8 @@ class Owlv2PreTrainedModel(PreTrainedModel):
                 std=module.vision_embed_dim**-0.5 * factor,
             )
             init.constant_(module.logit_scale, self.config.logit_scale_init_value)
+        elif isinstance(module, Owlv2ForObjectDetection):
+            init.copy_(module.box_bias, module.compute_box_bias(module.num_patches_height, module.num_patches_width))
         if isinstance(module, nn.LayerNorm):
             init.zeros_(module.bias)
             init.ones_(module.weight)

@@ -793,6 +796,7 @@ class Owlv2TextModel(Owlv2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):

@@ -903,6 +907,7 @@ class Owlv2VisionModel(Owlv2PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         Examples:

@@ -1052,6 +1057,7 @@ class Owlv2Model(Owlv2PreTrainedModel):
         interpolate_pos_encoding: bool = False,
         return_base_image_embeds: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Owlv2Output]:
         r"""
         return_loss (`bool`, *optional*):
@@ -1219,7 +1225,9 @@ class Owlv2ForObjectDetection(Owlv2PreTrainedModel):
         self.config = config
         self.num_patches_height = self.config.vision_config.image_size // self.config.vision_config.patch_size
         self.num_patches_width = self.config.vision_config.image_size // self.config.vision_config.patch_size
-        self.
+        self.register_buffer(
+            "box_bias", self.compute_box_bias(self.num_patches_height, self.num_patches_width), persistent=False
+        )
 
         # Initialize weights and apply final processing
         self.post_init()
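`Owlv2ForObjectDetection` (and `OwlViTForObjectDetection` below) stop caching `compute_box_bias` with `functools.lru_cache` and instead compute the bias once in `__init__`, keep it as a non-persistent buffer, and re-fill it from `_init_weights`. A non-persistent buffer follows the module across device and dtype moves but is excluded from the checkpoint; a minimal sketch of that registration pattern on a toy module, not the real detection head:

import torch
from torch import nn

class ToyDetector(nn.Module):
    def __init__(self, n: int = 4):
        super().__init__()
        # Computed once; persistent=False keeps it out of state_dict since it is cheap to recompute.
        self.register_buffer("box_bias", torch.linspace(0.0, 1.0, n), persistent=False)

m = ToyDetector()
print("box_bias" in m.state_dict())  # False
print(m.box_bias)                    # tensor([0.0000, 0.3333, 0.6667, 1.0000])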
@@ -1256,7 +1264,6 @@ class Owlv2ForObjectDetection(Owlv2PreTrainedModel):
         objectness_logits = objectness_logits[..., 0]
         return objectness_logits
 
-    @lru_cache(maxsize=2)
     # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.compute_box_bias
     def compute_box_bias(
         self, num_patches_height: int, num_patches_width: int, feature_map: Optional[torch.FloatTensor] = None

@@ -1602,6 +1609,7 @@ class Owlv2ForObjectDetection(Owlv2PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Owlv2ObjectDetectionOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*):

@@ -205,8 +205,6 @@ class Owlv2ImageProcessorFast(OwlViTImageProcessorFast):
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
 
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
-
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
 
 

@@ -15,7 +15,6 @@
 """PyTorch OWL-ViT model."""
 
 from dataclasses import dataclass
-from functools import lru_cache
 from typing import Any, Optional, Union
 
 import torch

@@ -562,10 +561,12 @@ class OwlViTPreTrainedModel(PreTrainedModel):
         if isinstance(module, OwlViTTextEmbeddings):
             init.normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02)
             init.normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, OwlViTVisionEmbeddings):
             init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
             init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
             init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, OwlViTAttention):
             in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
             out_proj_std = (module.embed_dim**-0.5) * factor

@@ -588,6 +589,8 @@ class OwlViTPreTrainedModel(PreTrainedModel):
                 std=module.vision_embed_dim**-0.5 * factor,
             )
             init.constant_(module.logit_scale, self.config.logit_scale_init_value)
+        elif isinstance(module, OwlViTForObjectDetection):
+            init.copy_(module.box_bias, module.compute_box_bias(module.num_patches_height, module.num_patches_width))
         if isinstance(module, nn.LayerNorm):
             init.zeros_(module.bias)
             init.ones_(module.weight)
@@ -777,6 +780,7 @@ class OwlViTTextModel(OwlViTPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):

@@ -885,6 +889,7 @@ class OwlViTVisionModel(OwlViTPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         Examples:

@@ -1033,6 +1038,7 @@ class OwlViTModel(OwlViTPreTrainedModel):
         interpolate_pos_encoding: bool = False,
         return_base_image_embeds: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, OwlViTOutput]:
         r"""
         return_loss (`bool`, *optional*):
@@ -1197,7 +1203,9 @@ class OwlViTForObjectDetection(OwlViTPreTrainedModel):
         self.config = config
         self.num_patches_height = self.config.vision_config.image_size // self.config.vision_config.patch_size
         self.num_patches_width = self.config.vision_config.image_size // self.config.vision_config.patch_size
-        self.
+        self.register_buffer(
+            "box_bias", self.compute_box_bias(self.num_patches_height, self.num_patches_width), persistent=False
+        )
 
         self.post_init()
 

@@ -1218,7 +1226,6 @@ class OwlViTForObjectDetection(OwlViTPreTrainedModel):
 
         return box_coordinates
 
-    @lru_cache(maxsize=2)
     def compute_box_bias(
         self, num_patches_height: int, num_patches_width: int, feature_map: Optional[torch.FloatTensor] = None
     ) -> torch.Tensor:

@@ -1543,6 +1550,7 @@ class OwlViTForObjectDetection(OwlViTPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> OwlViTObjectDetectionOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*):
@@ -0,0 +1,32 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_paddleocr_vl import *
+    from .image_processing_paddleocr_vl import *
+    from .image_processing_paddleocr_vl_fast import *
+    from .modeling_paddleocr_vl import *
+    from .processing_paddleocr_vl import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
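The new `paddleocr_vl` package `__init__.py` follows the lazy-import convention used across the library: submodules are imported eagerly only under `TYPE_CHECKING`, while at runtime the module object is replaced by a `_LazyModule` that resolves attributes on first access. A generic sketch of that idea using standard-library tools only; the real machinery lives in `transformers.utils._LazyModule` and `define_import_structure`, whose behavior may differ in detail:

import importlib
import types

class LazyModule(types.ModuleType):
    """Toy stand-in: resolves attributes from other modules only when first accessed."""

    def __init__(self, name: str, submodules: dict[str, str]):
        super().__init__(name)
        self._submodules = submodules  # attribute name -> module path providing it

    def __getattr__(self, item: str):
        if item in self._submodules:
            module = importlib.import_module(self._submodules[item])
            value = getattr(module, item)
            setattr(self, item, value)  # cache so later lookups skip __getattr__
            return value
        raise AttributeError(item)

# Usage sketch: the import of "math" only happens when sqrt is first touched.
lazy_math = LazyModule("lazy_math", {"sqrt": "math"})
print(lazy_math.sqrt(9.0))  # 3.0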