transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0

transformers/models/nystromformer/modeling_nystromformer.py
@@ -21,6 +21,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import (
@@ -413,6 +414,12 @@ class NystromformerPreTrainedModel(PreTrainedModel):
     base_model_prefix = "nystromformer"
     supports_gradient_checkpointing = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, NystromformerEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)) + 2)
+            init.zeros_(module.token_type_ids)
+
 
 @auto_docstring
 class NystromformerModel(NystromformerPreTrainedModel):
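
This hunk follows the broader rc2 pattern visible throughout the diff: `_init_weights` overrides call `super()._init_weights(module)` and then rebuild module-specific buffers through the new `transformers.initialization` helpers (`init.copy_`, `init.zeros_`, ...). A minimal sketch of the same idea using plain `torch`; the `ToyEmbeddings`/`ToyPreTrainedModel` names and the buffer shape are invented for illustration, not taken from transformers:

    import torch
    from torch import nn

    class ToyEmbeddings(nn.Module):
        def __init__(self, max_positions: int = 16):
            super().__init__()
            # Buffer whose contents are derived, not learned.
            self.register_buffer("position_ids", torch.zeros(1, max_positions, dtype=torch.long))

    class ToyPreTrainedModel(nn.Module):
        def _init_weights(self, module):
            # Re-derive buffer contents deterministically for fresh (non-checkpoint) weights,
            # which is roughly what init.copy_ does in the hunk above.
            if isinstance(module, ToyEmbeddings):
                with torch.no_grad():
                    module.position_ids.copy_(
                        torch.arange(module.position_ids.shape[-1]).expand((1, -1))
                    )

    emb = ToyEmbeddings()
    ToyPreTrainedModel()._init_weights(emb)
    print(emb.position_ids)  # tensor([[0, 1, 2, ..., 15]])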

transformers/models/olmo/modeling_olmo.py
@@ -93,7 +93,7 @@ class OlmoRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(

transformers/models/olmo2/modeling_olmo2.py
@@ -85,7 +85,7 @@ class Olmo2RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(

transformers/models/olmo3/modeling_olmo3.py
@@ -293,7 +293,7 @@ class Olmo3RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
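
All three Olmo rotary-embedding classes above (and several other models below) make the same change: `original_inv_freq` was a plain tensor attribute and is now registered as a non-persistent buffer. A minimal sketch of what that buys, with invented `dim`/`base` values rather than the real transformers class: the buffer follows `module.to(...)` device and dtype moves, while `persistent=False` keeps it out of the checkpoint so saved state dicts do not change.

    import torch
    from torch import nn

    class RotarySketch(nn.Module):
        # Minimal sketch, not the transformers implementation.
        def __init__(self, dim: int = 64, base: float = 10000.0):
            super().__init__()
            inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
            # Buffers move with .to()/.cuda(); persistent=False excludes them from state_dict.
            self.register_buffer("inv_freq", inv_freq, persistent=False)
            self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    m = RotarySketch()
    print("original_inv_freq" in m.state_dict())  # False: not serialized
    m = m.to(torch.float64)
    print(m.original_inv_freq.dtype)              # float64: buffers follow module casts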

transformers/models/olmoe/modeling_olmoe.py
@@ -27,14 +27,19 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
+from ...integrations import (
+    use_experts_implementation,
+    use_kernel_forward_from_hub,
+    use_kernel_func_from_hub,
+    use_kernelized_func,
+)
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_grouped_mm_available
 from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from .configuration_olmoe import OlmoeConfig
 
@@ -77,7 +82,7 @@ class OlmoeRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -298,6 +303,7 @@ class OlmoeAttention(nn.Module):
         return attn_output, attn_weights
 
 
+@use_experts_implementation
 class OlmoeExperts(nn.Module):
     """Collection of expert weights stored as 3D tensors."""
 
@@ -431,7 +437,9 @@ class OlmoePreTrainedModel(PreTrainedModel):
         "hidden_states": OlmoeDecoderLayer,
         "attentions": OlmoeAttention,
     }
-    _can_compile_fullgraph = False
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True
 
     @torch.no_grad()

transformers/models/olmoe/modular_olmoe.py
@@ -24,7 +24,7 @@ from ...masking_utils import create_causal_mask
 from ...modeling_outputs import MoeModelOutputWithPast
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, logging
+from ...utils import TransformersKwargs, auto_docstring, is_grouped_mm_available, logging
 from ...utils.generic import OutputRecorder
 from ..gemma.modeling_gemma import GemmaMLP
 from ..llama.modeling_llama import (
@@ -165,7 +165,9 @@ class OlmoePreTrainedModel(PreTrainedModel):
         "hidden_states": OlmoeDecoderLayer,
         "attentions": OlmoeAttention,
     }
-    _can_compile_fullgraph = False
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True
 
     @torch.no_grad()
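
In both the modeling and modular OlmoE files, `_can_compile_fullgraph` is now computed from `is_grouped_mm_available()` instead of being hard-coded, so full-graph `torch.compile` support for the experts layer follows the runtime's grouped-matmul capability. A hedged sketch of the pattern; `has_grouped_mm` below is an invented stand-in, not the real `is_grouped_mm_available` check, and the version cutoff is an assumption:

    import torch

    def has_grouped_mm() -> bool:
        # Illustrative capability probe: assume grouped matrix multiplication for expert
        # layers needs a sufficiently new PyTorch (cutoff chosen for the example only).
        major, minor = (int(x) for x in torch.__version__.split("+")[0].split(".")[:2])
        return (major, minor) >= (2, 5)

    class SketchPreTrainedModel:
        # Evaluated once at class-definition time, so all subclasses see the same answer.
        _can_compile_fullgraph = has_grouped_mm()

    print(SketchPreTrainedModel._can_compile_fullgraph)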

transformers/models/omdet_turbo/configuration_omdet_turbo.py
@@ -36,7 +36,7 @@ class OmDetTurboConfig(PreTrainedConfig):
     Args:
         text_config (`PreTrainedConfig`, *optional*):
             The configuration of the text backbone.
-        backbone_config (`PreTrainedConfig`, *optional
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `SwinConfig()`):
             The configuration of the vision backbone.
         use_timm_backbone (`bool`, *optional*, defaults to `True`):
             Whether to use the timm for the vision backbone.
@@ -68,7 +68,7 @@ class OmDetTurboConfig(PreTrainedConfig):
         class_embed_dim (`int`, *optional*, defaults to 512):
             The dimension of the classes embeddings.
         class_distance_type (`str`, *optional*, defaults to `"cosine"`):
-            The type of
+            The type of distance to compare predicted classes to projected classes embeddings.
             Can be `"cosine"` or `"dot"`.
         num_queries (`int`, *optional*, defaults to 900):
             The number of queries.

transformers/models/omdet_turbo/modeling_omdet_turbo.py
@@ -1022,6 +1022,10 @@ class OmDetTurboPreTrainedModel(PreTrainedModel):
         elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
             init.ones_(module.weight)
             init.zeros_(module.bias)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
 
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, OmDetTurboDecoder):
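
The new branch resets the running statistics that `nn.BatchNorm2d` stores as buffers rather than parameters (the same handling is added to OneFormer below). A small illustration in plain PyTorch of what those buffers are and what the reset amounts to:

    import torch
    from torch import nn

    bn = nn.BatchNorm2d(4)
    # BatchNorm statistics live in buffers, which is what the added _init_weights lines touch:
    print(bn.running_mean)         # zeros
    print(bn.running_var)          # ones
    print(bn.num_batches_tracked)  # 0

    # Resetting them by hand, equivalent in effect to the init.zeros_/init.ones_ calls above:
    with torch.no_grad():
        bn.running_mean.zero_()
        bn.running_var.fill_(1.0)
        bn.num_batches_tracked.zero_()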

transformers/models/oneformer/configuration_oneformer.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 """OneFormer model configuration"""
 
-from typing import Optional
+from typing import Optional, Union
 
 from ...configuration_utils import PreTrainedConfig
 from ...utils import logging
@@ -37,7 +37,7 @@ class OneFormerConfig(PreTrainedConfig):
     documentation from [`PreTrainedConfig`] for more information.
 
     Args:
-        backbone_config (`PreTrainedConfig`, *optional*, defaults to `SwinConfig`):
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `SwinConfig()`):
             The configuration of the backbone model.
         backbone (`str`, *optional*):
             Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
@@ -151,7 +151,7 @@ class OneFormerConfig(PreTrainedConfig):
 
     def __init__(
         self,
-        backbone_config: Optional[dict] = None,
+        backbone_config: Optional[Union[dict, PreTrainedConfig]] = None,
         backbone: Optional[str] = None,
        use_pretrained_backbone: bool = False,
        use_timm_backbone: bool = False,

transformers/models/oneformer/modeling_oneformer.py
@@ -935,44 +935,6 @@ class OneFormerForUniversalSegmentationOutput(ModelOutput):
     attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
 
 
-# Modified from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrFrozenBatchNorm2d with DeformableDetr->OneFormerPixelDecoder
-class OneFormerPixelDecoderFrozenBatchNorm2d(nn.Module):
-    """
-    BatchNorm2d where the batch statistics and the affine parameters are fixed.
-
-    Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than
-    torchvision.models.resnet[18,34,50,101] produce nans.
-    """
-
-    def __init__(self, n):
-        super().__init__()
-        self.register_buffer("weight", torch.ones(n))
-        self.register_buffer("bias", torch.zeros(n))
-        self.register_buffer("running_mean", torch.zeros(n))
-        self.register_buffer("running_var", torch.ones(n))
-
-    def _load_from_state_dict(
-        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
-    ):
-        num_batches_tracked_key = prefix + "num_batches_tracked"
-        if num_batches_tracked_key in state_dict:
-            del state_dict[num_batches_tracked_key]
-
-        super()._load_from_state_dict(
-            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
-        )
-
-    def forward(self, x):
-        weight = self.weight.reshape(1, -1, 1, 1)
-        bias = self.bias.reshape(1, -1, 1, 1)
-        running_var = self.running_var.reshape(1, -1, 1, 1)
-        running_mean = self.running_mean.reshape(1, -1, 1, 1)
-        epsilon = 1e-5
-        scale = weight * (running_var + epsilon).rsqrt()
-        bias = bias - running_mean * scale
-        return x * scale + bias
-
-
 # Modified from transformers.models.detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->OneFormerPixelDecoderEncoder
 class OneFormerPixelDecoderEncoderMultiscaleDeformableAttention(nn.Module):
     """
@@ -2833,6 +2795,10 @@ class OneFormerPreTrainedModel(PreTrainedModel):
             init.normal_(module.weight, mean=0.0, std=std)
             if module.bias is not None:
                 init.zeros_(module.bias)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
             init.ones_(module.weight)
             init.zeros_(module.bias)
@@ -2843,6 +2809,9 @@ class OneFormerPreTrainedModel(PreTrainedModel):
                 init.zeros_(module.weight[module.padding_idx])
         elif isinstance(module, OneFormerLoss):
            init.constant_(module.logit_scale, np.log(1 / self.config.contrastive_temperature))
+            empty_weight = torch.ones(module.num_classes + 1)
+            empty_weight[-1] = module.eos_coef
+            init.copy_(module.empty_weight, empty_weight)
 
 
 @auto_docstring
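
The `OneFormerLoss` addition fills the `empty_weight` buffer with the DETR-style class weighting: every real class gets weight 1 and the trailing "no object" class gets `eos_coef`, so the abundant background predictions do not dominate the classification loss. A short worked example of how such a weight vector is consumed (the sizes and values are made up for illustration):

    import torch
    import torch.nn.functional as F

    num_classes, eos_coef = 5, 0.1
    # Down-weight the trailing "no object" class, as in the buffer initialized above.
    empty_weight = torch.ones(num_classes + 1)
    empty_weight[-1] = eos_coef

    logits = torch.randn(8, num_classes + 1)           # (queries, classes + no-object)
    targets = torch.randint(0, num_classes + 1, (8,))  # target class index per query
    loss = F.cross_entropy(logits, targets, weight=empty_weight)
    print(loss.item())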

transformers/models/openai/modeling_openai.py
@@ -24,6 +24,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
+from ... import initialization as init
 from ...activations import gelu_new, get_activation, silu
 from ...generation import GenerationMixin
 from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
@@ -46,6 +47,7 @@ ACT_FNS = {"relu": nn.ReLU(), "silu": silu, "gelu": gelu_new, "swish": silu}
 class Attention(nn.Module):
     def __init__(self, nx, n_positions, config, scale=False):
         super().__init__()
+        self.n_positions = n_positions
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
         if n_state % config.n_head != 0:
             raise ValueError(f"Attention n_state shape: {n_state} must be divisible by config.n_head {config.n_head}")
@@ -259,6 +261,16 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
     config: OpenAIGPTConfig
     base_model_prefix = "transformer"
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Attention):
+            n_positions = module.n_positions
+            init.copy_(
+                module.bias, torch.tril(torch.ones(n_positions, n_positions)).view(1, 1, n_positions, n_positions)
+            )
+        elif isinstance(module, OpenAIGPTModel):
+            init.copy_(module.position_ids, torch.arange(module.config.n_positions))
+
 
 @dataclass
 @auto_docstring(
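
The re-initialized `Attention.bias` is the standard causal mask: a lower-triangular matrix reshaped so it broadcasts over (batch, heads, query, key). For example, with a toy `n_positions` of 4:

    import torch

    n_positions = 4
    # Same construction as the Attention bias above: token i may attend to tokens 0..i only.
    bias = torch.tril(torch.ones(n_positions, n_positions)).view(1, 1, n_positions, n_positions)
    print(bias[0, 0])
    # tensor([[1., 0., 0., 0.],
    #         [1., 1., 0., 0.],
    #         [1., 1., 1., 0.],
    #         [1., 1., 1., 1.]])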

transformers/models/ovis2/image_processing_ovis2_fast.py
@@ -213,7 +213,6 @@ class Ovis2ImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(data={"pixel_values": processed_images, "grids": grids}, tensor_type=return_tensors)
 
 
@@ -27,6 +27,7 @@ from typing import Optional, Union
|
|
|
27
27
|
import torch
|
|
28
28
|
from torch import nn
|
|
29
29
|
|
|
30
|
+
from ... import initialization as init
|
|
30
31
|
from ...activations import ACT2FN
|
|
31
32
|
from ...cache_utils import Cache
|
|
32
33
|
from ...generation import GenerationMixin
|
|
@@ -430,6 +431,11 @@ class Ovis2PreTrainedModel(PreTrainedModel):
|
|
|
430
431
|
_can_compile_fullgraph = True
|
|
431
432
|
_supports_attention_backend = True
|
|
432
433
|
|
|
434
|
+
def _init_weights(self, module):
|
|
435
|
+
super()._init_weights(module)
|
|
436
|
+
if isinstance(module, Ovis2VisionEmbeddings):
|
|
437
|
+
init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
|
|
438
|
+
|
|
433
439
|
|
|
434
440
|
def hard_softmax(logits: torch.Tensor, dim: int):
|
|
435
441
|
y_soft = logits.softmax(dim)
|
|
@@ -457,6 +463,8 @@ class Ovis2VisionModel(Ovis2PreTrainedModel):
|
|
|
457
463
|
)
|
|
458
464
|
self.head_norm = nn.LayerNorm(self.vocab_size - self.num_visual_indicator_tokens)
|
|
459
465
|
|
|
466
|
+
self.post_init()
|
|
467
|
+
|
|
460
468
|
def forward(self, pixel_values: torch.FloatTensor, **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
|
|
461
469
|
outputs = self.transformer(pixel_values, **kwargs)
|
|
462
470
|
last_hidden_state = outputs[0]
|
|
@@ -780,6 +788,7 @@ class Ovis2ForConditionalGeneration(Ovis2PreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

@@ -791,12 +800,15 @@ class Ovis2ForConditionalGeneration(Ovis2PreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if
-        #
-        #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
 
         return model_inputs
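`prepare_inputs_for_generation` now receives an explicit `is_first_iteration` flag and only forwards `pixel_values` on the first call (or when caching is disabled); later decoding steps rely on the image features already merged into the KV cache. A reduced sketch of just that gating logic, assuming a `model_inputs` dict as in the hunk:

    # Reduced sketch of the gating added above (not the full method).
    def select_pixel_values(model_inputs: dict, pixel_values, is_first_iteration: bool, **kwargs) -> dict:
        # Forward pixel values only on the first iteration, or when caching is off;
        # afterwards the vision features already live in the KV cache.
        if is_first_iteration or not kwargs.get("use_cache", True):
            model_inputs["pixel_values"] = pixel_values
        return model_inputs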
@@ -19,6 +19,7 @@ from typing import Optional, Union
 import torch
 from torch import nn
 
+from ... import initialization as init
 from ...cache_utils import Cache
 from ...generation import GenerationMixin
 from ...modeling_outputs import BaseModelOutput

@@ -159,6 +160,11 @@ class Ovis2PreTrainedModel(PreTrainedModel):
     _can_compile_fullgraph = True
     _supports_attention_backend = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Ovis2VisionEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+
 
 class Ovis2VisionModel(Ovis2PreTrainedModel):
     config: Ovis2VisionConfig

@@ -176,6 +182,8 @@ class Ovis2VisionModel(Ovis2PreTrainedModel):
         )
         self.head_norm = nn.LayerNorm(self.vocab_size - self.num_visual_indicator_tokens)
 
+        self.post_init()
+
     def forward(self, pixel_values: torch.FloatTensor, **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
         outputs = self.transformer(pixel_values, **kwargs)
         last_hidden_state = outputs[0]

@@ -336,8 +336,6 @@ class Owlv2ImageProcessorFast(BaseImageProcessorFast):
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
 
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
-
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
 
 

@@ -15,7 +15,6 @@
 """PyTorch OWLv2 model."""
 
 from dataclasses import dataclass
-from functools import lru_cache
 from typing import Any, Optional, Union
 
 import torch

@@ -575,10 +574,12 @@ class Owlv2PreTrainedModel(PreTrainedModel):
         if isinstance(module, Owlv2TextEmbeddings):
             init.normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02)
             init.normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, Owlv2VisionEmbeddings):
             init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
             init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
             init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, Owlv2Attention):
             in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
             out_proj_std = (module.embed_dim**-0.5) * factor

@@ -601,6 +602,8 @@ class Owlv2PreTrainedModel(PreTrainedModel):
                 std=module.vision_embed_dim**-0.5 * factor,
             )
             init.constant_(module.logit_scale, self.config.logit_scale_init_value)
+        elif isinstance(module, Owlv2ForObjectDetection):
+            init.copy_(module.box_bias, module.compute_box_bias(module.num_patches_height, module.num_patches_width))
         if isinstance(module, nn.LayerNorm):
             init.zeros_(module.bias)
             init.ones_(module.weight)

@@ -1222,7 +1225,9 @@ class Owlv2ForObjectDetection(Owlv2PreTrainedModel):
         self.config = config
         self.num_patches_height = self.config.vision_config.image_size // self.config.vision_config.patch_size
         self.num_patches_width = self.config.vision_config.image_size // self.config.vision_config.patch_size
-        self.
+        self.register_buffer(
+            "box_bias", self.compute_box_bias(self.num_patches_height, self.num_patches_width), persistent=False
+        )
 
         # Initialize weights and apply final processing
         self.post_init()

@@ -1259,7 +1264,6 @@ class Owlv2ForObjectDetection(Owlv2PreTrainedModel):
         objectness_logits = objectness_logits[..., 0]
         return objectness_logits
 
-    @lru_cache(maxsize=2)
     # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.compute_box_bias
     def compute_box_bias(
         self, num_patches_height: int, num_patches_width: int, feature_map: Optional[torch.FloatTensor] = None
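In both detection heads the `@lru_cache` around `compute_box_bias` goes away: the bias is computed once in `__init__`, stored as a non-persistent buffer via `register_buffer`, and re-created in `_init_weights`, so it is excluded from the checkpoint yet still moves with the module on `.to(device)`. A minimal sketch of that buffer pattern with a stand-in computation (`compute_grid_bias` and `DetectionHead` are not the real names):

    # Minimal sketch of replacing a cached method result with a non-persistent buffer.
    import torch
    from torch import nn


    def compute_grid_bias(h: int, w: int) -> torch.Tensor:
        # Stand-in for compute_box_bias: one (x, y) offset per patch.
        ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
        return torch.stack([xs, ys], dim=-1).float().view(-1, 2)


    class DetectionHead(nn.Module):
        def __init__(self, h: int, w: int):
            super().__init__()
            # not saved in the state dict, but moved by .to()/.cuda() together with the module
            self.register_buffer("box_bias", compute_grid_bias(h, w), persistent=False)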
@@ -205,8 +205,6 @@ class Owlv2ImageProcessorFast(OwlViTImageProcessorFast):
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
 
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
-
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
 
 

@@ -15,7 +15,6 @@
 """PyTorch OWL-ViT model."""
 
 from dataclasses import dataclass
-from functools import lru_cache
 from typing import Any, Optional, Union
 
 import torch

@@ -562,10 +561,12 @@ class OwlViTPreTrainedModel(PreTrainedModel):
         if isinstance(module, OwlViTTextEmbeddings):
             init.normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02)
             init.normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, OwlViTVisionEmbeddings):
             init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
             init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
             init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, OwlViTAttention):
             in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
             out_proj_std = (module.embed_dim**-0.5) * factor

@@ -588,6 +589,8 @@ class OwlViTPreTrainedModel(PreTrainedModel):
                 std=module.vision_embed_dim**-0.5 * factor,
             )
             init.constant_(module.logit_scale, self.config.logit_scale_init_value)
+        elif isinstance(module, OwlViTForObjectDetection):
+            init.copy_(module.box_bias, module.compute_box_bias(module.num_patches_height, module.num_patches_width))
         if isinstance(module, nn.LayerNorm):
             init.zeros_(module.bias)
             init.ones_(module.weight)

@@ -1200,7 +1203,9 @@ class OwlViTForObjectDetection(OwlViTPreTrainedModel):
         self.config = config
         self.num_patches_height = self.config.vision_config.image_size // self.config.vision_config.patch_size
         self.num_patches_width = self.config.vision_config.image_size // self.config.vision_config.patch_size
-        self.
+        self.register_buffer(
+            "box_bias", self.compute_box_bias(self.num_patches_height, self.num_patches_width), persistent=False
+        )
 
         self.post_init()
 

@@ -1221,7 +1226,6 @@ class OwlViTForObjectDetection(OwlViTPreTrainedModel):
 
         return box_coordinates
 
-    @lru_cache(maxsize=2)
     def compute_box_bias(
         self, num_patches_height: int, num_patches_width: int, feature_map: Optional[torch.FloatTensor] = None
     ) -> torch.Tensor:

@@ -165,8 +165,9 @@ class PaddleOCRVLImageProcessor(BaseImageProcessor):
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
-        if size is not None
-
+        if size is not None:
+            if "shortest_edge" not in size or "longest_edge" not in size:
+                raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
         else:
             size = {"shortest_edge": 56 * 56, "longest_edge": 28 * 28 * 1280}
         # backward compatibility: override size with min_pixels and max_pixels if they are provided
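The `size` handling in `PaddleOCRVLImageProcessor.__init__` becomes an explicit check: a user-supplied `size` must carry both `shortest_edge` and `longest_edge`, otherwise a `ValueError` is raised; when `size` is omitted the previous pixel-budget defaults are kept. A quick illustration of the resulting behavior (the standalone helper is a simplification of the constructor logic):

    # Illustrative behavior of the new validation; the constructor is reduced to a helper here.
    def validate_size(size: dict | None) -> dict:
        if size is not None:
            if "shortest_edge" not in size or "longest_edge" not in size:
                raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
        else:
            size = {"shortest_edge": 56 * 56, "longest_edge": 28 * 28 * 1280}
        return size

    validate_size(None)                     # -> default pixel budget
    validate_size({"shortest_edge": 3136})  # -> raises ValueError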
@@ -30,6 +30,7 @@ from typing import Any, Optional, Union
 import torch
 from torch import nn
 
+from ... import initialization as init
 from ...activations import ACT2FN, GELUActivation
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin

@@ -90,6 +91,8 @@ class PaddleOCRVisionRotaryEmbedding(nn.Module):
 
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
+        self.dim = dim
+        self.theta = theta
         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
 

@@ -116,7 +119,7 @@ class PaddleOCRRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
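`original_inv_freq` changes from a plain attribute to a non-persistent buffer holding a clone of `inv_freq`: as a buffer it follows `.to(device)` and dtype moves with the module while staying out of the checkpoint, and keeping a clone preserves the unscaled frequencies even if `inv_freq` is later overwritten by dynamic RoPE scaling. A compact sketch of that setup:

    # Compact sketch of keeping both the live and the original RoPE frequencies as buffers.
    import torch
    from torch import nn


    class RotaryEmbedding(nn.Module):
        def __init__(self, dim: int, theta: float = 10000.0):
            super().__init__()
            inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
            self.register_buffer("inv_freq", inv_freq, persistent=False)
            # untouched copy, usable to reset or rescale later; not stored in the state dict
            self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)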
@@ -444,6 +447,14 @@ class PaddleOCRVLPreTrainedModel(PreTrainedModel):
         "attentions": PaddleOCRAttention,
     }
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, PaddleOCRVisionEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+        elif isinstance(module, PaddleOCRVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 @auto_docstring
 class PaddleOCRTextModel(PaddleOCRVLPreTrainedModel):
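Storing `dim` and `theta` on `PaddleOCRVisionRotaryEmbedding` (two hunks earlier) exists precisely so this new `_init_weights` branch can rebuild `inv_freq` from scratch when weights are (re)initialized, for example when the model is materialized from empty or meta tensors; the expression is the standard RoPE inverse-frequency schedule. A small numeric check of that formula:

    # Small check of the inverse-frequency formula used in the hunk above.
    import torch

    dim, theta = 8, 10000.0
    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
    # tensor([1.0000, 0.1000, 0.0100, 0.0010]) -- frequencies decay geometrically with channel index
    print(inv_freq)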
@@ -859,18 +870,17 @@ class PaddleOCRVisionEncoder(nn.Module):
         attention_mask: Optional[torch.Tensor] = None,
         image_grid_thw: Optional[list[Union[tuple[int, int, int], list[tuple[int, int, int]]]]] = None,
     ) -> BaseModelOutput:
-        """
-
-
-
-
-
-
-
-        attention_mask
-
-
-            The temporal, height and width of feature shape of each image in LLM.
+        r"""
+        inputs_embeds (`torch.FloatTensor` of shape `(sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
+        cu_seqlens (`torch.Tensor` of shape `(num_images + 1,)`):
+            The cumulative sequence lengths of each image or video feature.
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            The attention_mask used in forward function shape [batch_size X sequence_length] if not None.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
         """
         device = inputs_embeds.device
         hidden_states = inputs_embeds

@@ -919,6 +929,8 @@ class PaddleOCRVisionTransformer(PaddleOCRVLPreTrainedModel):
         self.encoder = PaddleOCRVisionEncoder(config)
         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
 
+        self.post_init()
+
     def forward(
         self,
         pixel_values: torch.FloatTensor,

@@ -1466,6 +1478,7 @@ class PaddleOCRVLForConditionalGeneration(PaddleOCRVLPreTrainedModel, Generation
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

@@ -1482,6 +1495,7 @@ class PaddleOCRVLForConditionalGeneration(PaddleOCRVLPreTrainedModel, Generation
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 

@@ -1513,7 +1527,7 @@ class PaddleOCRVLForConditionalGeneration(PaddleOCRVLPreTrainedModel, Generation
             text_positions = model_inputs["position_ids"][None, ...]
             model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0)
 
-        if
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None
 
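The same `is_first_iteration` plumbing is applied to `PaddleOCRVLForConditionalGeneration`: image and video pixel values are cleared on every step after the first whenever a cache is in use. As the Ovis2 comment above notes, the "first iteration" need not be the prefill of a fresh prompt; it can also be the first step after a cached system prompt. A hedged usage sketch of that continue-from-cache scenario (`model`, `system_inputs`, and `question_inputs` are placeholders, not defined by the diff):

    # Hedged usage sketch of generating on top of a cached system prompt.
    import torch
    from transformers import DynamicCache

    cache = DynamicCache()
    with torch.no_grad():
        model(**system_inputs, past_key_values=cache, use_cache=True)  # prefill the system prompt

    # The question turn is the "first iteration" for this request: images attached to it are
    # forwarded once, then dropped from the model inputs on subsequent decoding steps.
    output_ids = model.generate(**question_inputs, past_key_values=cache, max_new_tokens=64)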