transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0

transformers/models/ministral/modeling_ministral.py

```diff
@@ -13,7 +13,7 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import (
@@ -27,7 +27,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_ministral import MinistralConfig


@@ -120,6 +120,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class MinistralAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -138,7 +139,6 @@ class MinistralAttention(nn.Module):
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
         self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
         self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
-        self.rotary_fn = apply_rotary_pos_emb
         self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None

     def forward(
@@ -289,7 +289,7 @@ class MinistralRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -328,7 +328,7 @@ class MinistralRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
```
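
The rotary-embedding hunks above swap the bare autocast context (truncated to `with` in this rendering) for `maybe_autocast(device_type=device_type, enabled=False)` from `transformers.utils.generic`, keeping the frequency computation in float32. A minimal sketch of how such a helper could behave, assuming it simply falls back to a no-op context when autocast is unavailable for the device type (the actual implementation may differ):

```python
import contextlib
import torch

def maybe_autocast(device_type: str = "cpu", enabled: bool = True, **kwargs):
    """Sketch: use torch.autocast when the device type supports it, else a no-op context."""
    try:
        return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
    except RuntimeError:
        # e.g. an unknown or unsupported device type
        return contextlib.nullcontext()

# Usage mirroring the diff: force float32 for the RoPE frequency tables
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, 8, 2).float() / 8))
position_ids = torch.arange(16)[None, :]
with maybe_autocast(device_type="cpu", enabled=False):  # Force float32
    freqs = (inv_freq[None, :, None].float() @ position_ids[:, None, :].float()).transpose(1, 2)
    emb = torch.cat((freqs, freqs), dim=-1)
    cos, sin = emb.cos(), emb.sin()
print(cos.shape, cos.dtype)  # torch.Size([1, 16, 8]) torch.float32
```

Disabling autocast around this matmul keeps the cos/sin tables from being accumulated in half precision, which would otherwise degrade positional accuracy at long context lengths.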

transformers/models/ministral3/configuration_ministral3.py

```diff
@@ -193,7 +193,7 @@ class Ministral3Config(PreTrainedConfig):
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
             tie_word_embeddings=tie_word_embeddings,
-            ignore_keys_at_rope_validation={"llama_4_scaling_beta"},
+            ignore_keys_at_rope_validation={"llama_4_scaling_beta", "max_position_embeddings"},
             **kwargs,
         )

```
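
The configuration hunk adds `max_position_embeddings` to `ignore_keys_at_rope_validation`, so RoPE-parameter validation no longer flags that key. A rough sketch of the idea, under the assumption that validation simply subtracts the ignore set before complaining about unexpected keys (the real `PreTrainedConfig` logic is more involved, and the expected-key set below is made up):

```python
# Hypothetical helper, only to illustrate what an ignore set buys you.
EXPECTED_ROPE_KEYS = {"rope_type", "factor"}  # assumed, not the real list

def validate_rope_parameters(rope_parameters: dict, ignore_keys=frozenset()):
    unexpected = set(rope_parameters) - EXPECTED_ROPE_KEYS - set(ignore_keys)
    if unexpected:
        raise ValueError(f"Unexpected RoPE keys: {sorted(unexpected)}")

# With the extended set from the diff, these extra entries no longer raise:
validate_rope_parameters(
    {"rope_type": "default", "factor": 1.0, "llama_4_scaling_beta": 0.5, "max_position_embeddings": 131072},
    ignore_keys={"llama_4_scaling_beta", "max_position_embeddings"},
)
```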

transformers/models/ministral3/modeling_ministral3.py

```diff
@@ -15,7 +15,7 @@ from transformers.utils.generic import check_model_inputs
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import (
@@ -29,6 +29,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.generic import maybe_autocast
 from .configuration_ministral3 import Ministral3Config


@@ -110,6 +111,7 @@ def _get_llama_4_attn_scale(positions_ids: torch.Tensor, beta: float, max_positi
     return scaling.unsqueeze(-1)


+@use_kernelized_func(apply_rotary_pos_emb)
 class Ministral3Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -126,7 +128,6 @@ class Ministral3Attention(nn.Module):
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
         self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
         self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
-        self.rotary_fn = apply_rotary_pos_emb

     def forward(
         self,
@@ -294,7 +295,7 @@ class Ministral3RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -333,7 +334,7 @@ class Ministral3RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
```
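
Across these attention classes, the per-instance `self.rotary_fn = apply_rotary_pos_emb` assignment is replaced by a class-level `@use_kernelized_func(apply_rotary_pos_emb)` decorator from `transformers.integrations`. A toy sketch of that decorator pattern, assuming it attaches the (possibly kernel-accelerated) function to the class so instances can still call `self.rotary_fn`; the real helper may resolve an optimized hub kernel rather than the reference function:

```python
def use_kernelized_func(reference_fn):
    """Toy decorator: bind a rotary function to the class at definition time."""
    def decorator(cls):
        # A real implementation could look up a faster kernel and fall back
        # to `reference_fn` when none is available.
        cls.rotary_fn = staticmethod(reference_fn)
        return cls
    return decorator

def apply_rotary_pos_emb(q, k, cos, sin):
    return q * cos + q * sin, k * cos + k * sin  # placeholder math, not the real rotation

@use_kernelized_func(apply_rotary_pos_emb)
class ToyAttention:
    def forward(self, q, k, cos, sin):
        # no `self.rotary_fn = apply_rotary_pos_emb` needed in __init__ anymore
        return self.rotary_fn(q, k, cos, sin)
```

Binding at the class level keeps `__init__` free of kernel plumbing and lets the implementation choice be made once per class rather than once per instance.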

transformers/models/mistral/modeling_mistral.py

```diff
@@ -15,7 +15,7 @@ from transformers.utils.generic import check_model_inputs
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import (
@@ -29,6 +29,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.generic import maybe_autocast
 from .configuration_mistral import MistralConfig


@@ -121,6 +122,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class MistralAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -137,7 +139,6 @@ class MistralAttention(nn.Module):
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
         self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
         self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
-        self.rotary_fn = apply_rotary_pos_emb

     def forward(
         self,
@@ -284,7 +285,7 @@ class MistralRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -323,7 +324,7 @@ class MistralRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
```
@@ -252,7 +252,9 @@ class Mistral3Model(Mistral3PreTrainedModel):
|
|
|
252
252
|
|
|
253
253
|
image_features = self.multi_modal_projector(selected_image_feature.squeeze(0), image_sizes)
|
|
254
254
|
downsample_ratio = self.vision_tower.patch_size * self.config.spatial_merge_size
|
|
255
|
-
split_sizes =
|
|
255
|
+
split_sizes = (
|
|
256
|
+
(torch.as_tensor(image_sizes, device=image_features.device) // downsample_ratio).prod(dim=-1).tolist()
|
|
257
|
+
)
|
|
256
258
|
image_features = torch.split(image_features.squeeze(0), split_sizes)
|
|
257
259
|
return image_features
|
|
258
260
|
|
|
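
The `split_sizes` computation in `Mistral3Model` now derives each image's token count directly from `image_sizes` and the downsample ratio before splitting the flat feature sequence. A small worked example with made-up sizes (patch size, merge factor, and hidden size are illustrative, not the checkpoint's values):

```python
import torch

image_sizes = [(336, 672), (336, 336)]  # (height, width) per image, example values
downsample_ratio = 14 * 2               # patch_size * spatial_merge_size, example values

# Each image contributes (H // ratio) * (W // ratio) tokens to the flat sequence.
split_sizes = (torch.as_tensor(image_sizes) // downsample_ratio).prod(dim=-1).tolist()
print(split_sizes)  # [288, 144]

image_features = torch.randn(sum(split_sizes), 16)    # flat sequence of image tokens
per_image = torch.split(image_features, split_sizes)  # back to one tensor per image
print([t.shape[0] for t in per_image])                # [288, 144]
```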
@@ -489,6 +491,7 @@ class Mistral3ForConditionalGeneration(Mistral3PreTrainedModel, GenerationMixin)
489 491 | attention_mask=None,
490 492 | cache_position=None,
491 493 | logits_to_keep=None,
494 + | is_first_iteration=False,
492 495 | **kwargs,
493 496 | ):
494 497 | # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

@@ -500,12 +503,15 @@ class Mistral3ForConditionalGeneration(Mistral3PreTrainedModel, GenerationMixin)
500 503 | attention_mask=attention_mask,
501 504 | cache_position=cache_position,
502 505 | logits_to_keep=logits_to_keep,
506 + | is_first_iteration=is_first_iteration,
503 507 | **kwargs,
504 508 | )
505 509 |
506 - | if
507 - | #
508 - | #
510 + | if is_first_iteration or not kwargs.get("use_cache", True):
511 + | # Pixel values are used only in the first iteration if available
512 + | # In subsquent iterations, they are already merged with text and cached
513 + | # NOTE: first iteration doesn't have to be prefill, it can be the first
514 + | # iteration with a question and cached system prompt (continue generate from cache)
509 515 | model_inputs["pixel_values"] = pixel_values
510 516 |
511 517 | return model_inputs
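
`prepare_inputs_for_generation` now receives an explicit `is_first_iteration` flag and only forwards `pixel_values` on the first call or when caching is disabled; afterwards the image embeddings are already merged into the cached sequence. The gating condition in isolation:

```python
# Standalone illustration of the new gating logic for pixel inputs.
def attach_pixel_values(model_inputs, pixel_values, is_first_iteration, use_cache=True):
    if is_first_iteration or not use_cache:
        model_inputs["pixel_values"] = pixel_values
    return model_inputs

print(attach_pixel_values({}, "pixels", is_first_iteration=True))                    # {'pixel_values': 'pixels'}
print(attach_pixel_values({}, "pixels", is_first_iteration=False))                   # {}
print(attach_pixel_values({}, "pixels", is_first_iteration=False, use_cache=False))  # {'pixel_values': 'pixels'}
```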
@@ -157,7 +157,9 @@ class Mistral3Model(LlavaModel):
157 157 |
158 158 | image_features = self.multi_modal_projector(selected_image_feature.squeeze(0), image_sizes)
159 159 | downsample_ratio = self.vision_tower.patch_size * self.config.spatial_merge_size
160 - | split_sizes =
160 + | split_sizes = (
161 + | (torch.as_tensor(image_sizes, device=image_features.device) // downsample_ratio).prod(dim=-1).tolist()
162 + | )
161 163 | image_features = torch.split(image_features.squeeze(0), split_sizes)
162 164 | return image_features
163 165 |

@@ -37,7 +37,12 @@ from ... import initialization as init
37 37 | from ...activations import ACT2FN
38 38 | from ...cache_utils import Cache, DynamicCache
39 39 | from ...generation import GenerationMixin
40 - | from ...integrations import
40 + | from ...integrations import (
41 + | use_experts_implementation,
42 + | use_kernel_forward_from_hub,
43 + | use_kernel_func_from_hub,
44 + | use_kernelized_func,
45 + | )
41 46 | from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
42 47 | from ...modeling_flash_attention_utils import FlashAttentionKwargs
43 48 | from ...modeling_layers import (

@@ -50,11 +55,12 @@ from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPas
50 55 | from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
51 56 | from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
52 57 | from ...processing_utils import Unpack
53 - | from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
54 - | from ...utils.generic import OutputRecorder
58 + | from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_grouped_mm_available
59 + | from ...utils.generic import OutputRecorder, maybe_autocast
55 60 | from .configuration_mixtral import MixtralConfig
56 61 |
57 62 |
63 + | @use_experts_implementation
58 64 | class MixtralExperts(nn.Module):
59 65 | """Collection of expert weights stored as 3D tensors."""
60 66 |
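
`MixtralExperts` keeps all expert weights as stacked 3D tensors, and the new `@use_experts_implementation` decorator allows the forward to be swapped for an optimized (e.g. grouped-matmul) implementation. A simplified loop-based reference of routing tokens through stacked expert weights, assuming one routed expert per token and a SiLU activation (the real module uses the full gate/up/down projection layout):

```python
import torch
import torch.nn.functional as F

num_experts, hidden, intermediate = 4, 8, 16
w_up = torch.randn(num_experts, hidden, intermediate)    # [E, H, I]
w_down = torch.randn(num_experts, intermediate, hidden)  # [E, I, H]

tokens = torch.randn(10, hidden)
expert_ids = torch.randint(0, num_experts, (10,))  # routed expert per token

out = torch.zeros_like(tokens)
for e in range(num_experts):
    mask = expert_ids == e
    if mask.any():
        out[mask] = F.silu(tokens[mask] @ w_up[e]) @ w_down[e]
print(out.shape)  # torch.Size([10, 8])
```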
@@ -169,7 +175,7 @@ class MixtralRotaryEmbedding(nn.Module):
169 175 | inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
170 176 |
171 177 | self.register_buffer("inv_freq", inv_freq, persistent=False)
172 - | self.original_inv_freq =
178 + | self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
173 179 |
174 180 | @staticmethod
175 181 | def compute_default_rope_parameters(

@@ -208,7 +214,7 @@ class MixtralRotaryEmbedding(nn.Module):
208 214 | position_ids_expanded = position_ids[:, None, :].float()
209 215 |
210 216 | device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
211 - | with
217 + | with maybe_autocast(device_type=device_type, enabled=False): # Force float32
212 218 | freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
213 219 | emb = torch.cat((freqs, freqs), dim=-1)
214 220 | cos = emb.cos() * self.attention_scaling

@@ -290,6 +296,7 @@ def eager_attention_forward(
290 296 | return attn_output, attn_weights
291 297 |
292 298 |
299 + | @use_kernelized_func(apply_rotary_pos_emb)
293 300 | class MixtralAttention(nn.Module):
294 301 | """Multi-headed attention from 'Attention Is All You Need' paper"""
295 302 |

@@ -306,7 +313,6 @@ class MixtralAttention(nn.Module):
306 313 | self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
307 314 | self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
308 315 | self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
309 - | self.rotary_fn = apply_rotary_pos_emb
310 316 |
311 317 | def forward(
312 318 | self,

@@ -403,7 +409,9 @@ class MixtralPreTrainedModel(PreTrainedModel):
403 409 | _supports_flash_attn = True
404 410 | _supports_sdpa = True
405 411 | _supports_flex_attn = True
406 - | _can_compile_fullgraph =
412 + | _can_compile_fullgraph = (
413 + | is_grouped_mm_available()
414 + | ) # https://huggingface.co/docs/transformers/experts_interface#torchcompile
407 415 | _supports_attention_backend = True
408 416 | _can_record_outputs = {
409 417 | "router_logits": OutputRecorder(MixtralTopKRouter, index=0),
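
`_can_compile_fullgraph` is no longer a fixed class attribute; it now depends on `is_grouped_mm_available()`, i.e. full-graph `torch.compile` support is only advertised when the grouped-matmul expert path can be used (see the linked docs page). A sketch of the pattern under the assumption that the check probes for a grouped-matmul primitive in torch; the actual helper may use different criteria:

```python
import torch

def grouped_mm_probe() -> bool:
    # Assumption for illustration only: key the capability off torch._grouped_mm.
    return hasattr(torch, "_grouped_mm")

class SketchPreTrainedModel:
    # Full-graph compile is only claimed when the expert forward can avoid
    # data-dependent Python loops, which the grouped-matmul path enables.
    _can_compile_fullgraph = grouped_mm_probe()

print(SketchPreTrainedModel._can_compile_fullgraph)
```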
@@ -28,12 +28,13 @@ from torch import nn
28 28 | from ... import initialization as init
29 29 | from ...activations import ACT2FN
30 30 | from ...cache_utils import Cache, DynamicCache
31 + | from ...integrations import use_experts_implementation
31 32 | from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
32 33 | from ...modeling_layers import GradientCheckpointingLayer
33 34 | from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
34 35 | from ...modeling_utils import PreTrainedModel
35 36 | from ...processing_utils import Unpack
36 - | from ...utils import TransformersKwargs, logging
37 + | from ...utils import TransformersKwargs, is_grouped_mm_available, logging
37 38 | from ...utils.generic import OutputRecorder
38 39 | from ..mistral.modeling_mistral import (
39 40 | MistralAttention,

@@ -134,6 +135,7 @@ def load_balancing_loss_func(
134 135 | return overall_loss * num_experts
135 136 |
136 137 |
138 + | @use_experts_implementation
137 139 | class MixtralExperts(nn.Module):
138 140 | """Collection of expert weights stored as 3D tensors."""
139 141 |

@@ -263,7 +265,9 @@ class MixtralDecoderLayer(GradientCheckpointingLayer):
263 265 |
264 266 |
265 267 | class MixtralPreTrainedModel(MistralPreTrainedModel):
266 - | _can_compile_fullgraph =
268 + | _can_compile_fullgraph = (
269 + | is_grouped_mm_available()
270 + | ) # https://huggingface.co/docs/transformers/experts_interface#torchcompile
267 271 | _can_record_outputs = {
268 272 | "router_logits": OutputRecorder(MixtralTopKRouter, index=0),
269 273 | "hidden_states": MixtralDecoderLayer,

@@ -55,6 +55,8 @@ class MLCDRotaryEmbedding(nn.Module):
55 55 |
56 56 | def __init__(self, dim: int, theta: float = 10000.0) -> None:
57 57 | super().__init__()
58 + | self.dim = dim
59 + | self.theta = theta
58 60 | inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
59 61 | self.register_buffer("inv_freq", inv_freq, persistent=False)
60 62 |

@@ -424,6 +426,7 @@ class MLCDPreTrainedModel(PreTrainedModel):
424 426 | factor = self.config.initializer_factor
425 427 | init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
426 428 | init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
429 + | init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
427 430 | elif isinstance(module, MLCDAttention):
428 431 | factor = self.config.initializer_factor
429 432 | in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor

@@ -447,6 +450,9 @@ class MLCDPreTrainedModel(PreTrainedModel):
447 450 | init.ones_(module.weight)
448 451 | elif isinstance(module, nn.Linear) and module.bias is not None:
449 452 | init.zeros_(module.bias)
453 + | elif isinstance(module, MLCDRotaryEmbedding):
454 + | inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
455 + | init.copy_(module.inv_freq, inv_freq)
450 456 |
451 457 |
452 458 | class MLCDVisionTransformer(nn.Module):
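
`MLCDRotaryEmbedding` now stores `dim` and `theta` on the module so that `_init_weights` can rebuild `inv_freq` later (for example after the buffer was created without real values). The recomputation is exactly the expression in the new branch:

```python
import torch

dim, theta = 8, 10000.0  # example values; the model takes them from its config
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
print(inv_freq)  # tensor([1.0000, 0.1000, 0.0100, 0.0010])
```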
@@ -363,6 +363,7 @@ class MLCDPreTrainedModel(PreTrainedModel):
363 363 | factor = self.config.initializer_factor
364 364 | init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
365 365 | init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
366 + | init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
366 367 | elif isinstance(module, MLCDAttention):
367 368 | factor = self.config.initializer_factor
368 369 | in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor

@@ -386,6 +387,9 @@ class MLCDPreTrainedModel(PreTrainedModel):
386 387 | init.ones_(module.weight)
387 388 | elif isinstance(module, nn.Linear) and module.bias is not None:
388 389 | init.zeros_(module.bias)
390 + | elif isinstance(module, MLCDRotaryEmbedding):
391 + | inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
392 + | init.copy_(module.inv_freq, inv_freq)
389 393 |
390 394 |
391 395 | class MLCDVisionTransformer(CLIPVisionTransformer):

@@ -37,7 +37,7 @@ from ...modeling_rope_utils import (
37 37 | from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
38 38 | from ...processing_utils import Unpack
39 39 | from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
40 - | from ...utils.generic import OutputRecorder, check_model_inputs
40 + | from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
41 41 | from .configuration_mllama import MllamaConfig, MllamaTextConfig, MllamaVisionConfig
42 42 |
43 43 |

@@ -741,7 +741,7 @@ class MllamaRotaryEmbedding(nn.Module):
741 741 | inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
742 742 |
743 743 | self.register_buffer("inv_freq", inv_freq, persistent=False)
744 - | self.original_inv_freq =
744 + | self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
745 745 |
746 746 | @staticmethod
747 747 | def compute_default_rope_parameters(

@@ -781,7 +781,7 @@ class MllamaRotaryEmbedding(nn.Module):
781 781 | position_ids_expanded = position_ids[:, None, :].float()
782 782 |
783 783 | device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
784 - | with
784 + | with maybe_autocast(device_type=device_type, enabled=False): # Force float32
785 785 | freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
786 786 | emb = torch.cat((freqs, freqs), dim=-1)
787 787 | cos = emb.cos() * self.attention_scaling

@@ -847,6 +847,15 @@ class MllamaPreTrainedModel(PreTrainedModel):
847 847 | elif isinstance(module, MllamaPrecomputedAspectRatioEmbedding):
848 848 | if module.is_gated:
849 849 | init.zeros_(module.gate)
850 + | elif isinstance(module, MllamaRotaryEmbedding):
851 + | rope_fn = (
852 + | ROPE_INIT_FUNCTIONS[module.rope_type]
853 + | if module.rope_type != "default"
854 + | else module.compute_default_rope_parameters
855 + | )
856 + | buffer_value, _ = rope_fn(module.config)
857 + | init.copy_(module.inv_freq, buffer_value)
858 + | init.copy_(module.original_inv_freq, buffer_value)
850 859 |
851 860 | # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask
852 861 | def _update_causal_mask(

@@ -1721,6 +1730,7 @@ class MllamaForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin):
1721 1730 | use_cache=False,
1722 1731 | cache_position=None,
1723 1732 | logits_to_keep=None,
1733 + | is_first_iteration=False,
1724 1734 | **kwargs,
1725 1735 | ):
1726 1736 | # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

@@ -1738,12 +1748,13 @@ class MllamaForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin):
1738 1748 | cross_attention_mask=cross_attention_mask,
1739 1749 | cache_position=cache_position,
1740 1750 | logits_to_keep=logits_to_keep,
1751 + | is_first_iteration=is_first_iteration,
1741 1752 | **kwargs,
1742 1753 | )
1743 1754 |
1744 1755 | # If we're in pre-fill or cacheless decoding step, then we need pixel_values and aspect ratios
1745 1756 | # to compute image hidden states, otherwise they are cached within each cross attn layer
1746 - | if
1757 + | if not is_first_iteration and use_cache:
1747 1758 | model_inputs["pixel_values"] = None
1748 1759 | model_inputs["aspect_ratio_ids"] = None
1749 1760 | model_inputs["aspect_ratio_mask"] = None

@@ -234,8 +234,8 @@ class MLukeTokenizer(TokenizersBackend):
234 234 | entity_pad_token="[PAD]",
235 235 | entity_mask_token="[MASK]",
236 236 | entity_mask2_token="[MASK2]",
237 - | vocab: Optional[list] = None,
238 - | entity_vocab: Optional[dict] = None,
237 + | vocab: Optional[Union[str, dict, list]] = None,
238 + | entity_vocab: Optional[Union[str, dict, list]] = None,
239 239 | **kwargs,
240 240 | ) -> None:
241 241 | # Mask token behave like a normal word, i.e. include the space before it

@@ -263,10 +263,13 @@ class MLukeTokenizer(TokenizersBackend):
263 263 | entity_vocab = kwargs.pop("entity_vocab")
264 264 |
265 265 | # Build vocab from data (list of (token, score) tuples)
266 - | if vocab
266 + | if isinstance(vocab, list):
267 267 | # vocab is list of (token, score) tuples from SentencePieceExtractor
268 268 | self._vocab = [(token, float(score)) for token, score in vocab]
269 269 | self._vocab_size = len(self._vocab)
270 + | elif vocab is not None:
271 + | self._vocab = vocab
272 + | self._vocab_size = 0
270 273 | else:
271 274 | # Create minimal vocab with <unk> to satisfy Unigram requirements
272 275 | self._vocab = [("<unk>", 0.0)]
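
The `MLukeTokenizer` constructor now accepts `vocab` either as the `(token, score)` list produced by `SentencePieceExtractor`, as an already-built vocab object, or as `None`. The branching reduces to:

```python
from typing import Optional, Union

def normalize_vocab(vocab: Optional[Union[str, dict, list]]):
    if isinstance(vocab, list):
        # list of (token, score) tuples from SentencePieceExtractor
        return [(token, float(score)) for token, score in vocab]
    elif vocab is not None:
        return vocab
    # minimal Unigram-compatible fallback
    return [("<unk>", 0.0)]

print(normalize_vocab([("<unk>", 0), ("hello", -3.2)]))  # [('<unk>', 0.0), ('hello', -3.2)]
print(normalize_vocab(None))                             # [('<unk>', 0.0)]
```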
@@ -365,10 +368,7 @@ class MLukeTokenizer(TokenizersBackend):
365 368 |
366 369 | kwargs["extra_special_tokens"] = extra_tokens
367 370 |
368 - | tokenizer_object = self._tokenizer
369 - |
370 371 | super().__init__(
371 - | tokenizer_object=tokenizer_object,
372 372 | bos_token=bos_token,
373 373 | eos_token=eos_token,
374 374 | unk_token=unk_token,

@@ -38,7 +38,7 @@ class MMGroundingDinoConfig(PreTrainedConfig):
38 38 | documentation from [`PreTrainedConfig`] for more information.
39 39 |
40 40 | Args:
41 - | backbone_config (`PreTrainedConfig
41 + | backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `SwinConfig()`):
42 42 | The configuration of the backbone model.
43 43 | backbone (`str`, *optional*):
44 44 | Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this

@@ -280,7 +280,6 @@ class MMGroundingDinoConfig(PreTrainedConfig):
280 280 | self.layer_norm_eps = layer_norm_eps
281 281 |
282 282 | super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
283 - | self.tie_encoder_decoder = True
284 283 |
285 284 |
286 285 | __all__ = ["MMGroundingDinoConfig"]

@@ -552,7 +552,7 @@ class MMGroundingDinoPreTrainedModel(PreTrainedModel):
552 552 | elif isinstance(module, MMGroundingDinoFusionLayer):
553 553 | init.constant_(module.vision_param, 1e-4)
554 554 | init.constant_(module.text_param, 1e-4)
555 - | elif isinstance(module, (nn.Linear, nn.Conv2d
555 + | elif isinstance(module, (nn.Linear, nn.Conv2d)):
556 556 | init.normal_(module.weight, mean=0.0, std=std)
557 557 | if module.bias is not None:
558 558 | init.zeros_(module.bias)

@@ -1180,7 +1180,8 @@ class MMGroundingDinoEncoder(MMGroundingDinoPreTrainedModel):
1180 1180 | output_attentions=None,
1181 1181 | output_hidden_states=None,
1182 1182 | return_dict=None,
1183 - |
1183 + | **kwargs,
1184 + | ) -> Union[tuple, MMGroundingDinoEncoderOutput]:
1184 1185 | r"""
1185 1186 | Args:
1186 1187 | vision_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):

@@ -1476,7 +1477,8 @@ class MMGroundingDinoDecoder(MMGroundingDinoPreTrainedModel):
1476 1477 | output_attentions=None,
1477 1478 | output_hidden_states=None,
1478 1479 | return_dict=None,
1479 - |
1480 + | **kwargs,
1481 + | ) -> Union[tuple, MMGroundingDinoDecoderOutput]:
1480 1482 | r"""
1481 1483 | Args:
1482 1484 | inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):

@@ -1951,7 +1953,8 @@ class MMGroundingDinoModel(MMGroundingDinoPreTrainedModel):
1951 1953 | output_attentions=None,
1952 1954 | output_hidden_states=None,
1953 1955 | return_dict=None,
1954 - |
1956 + | **kwargs,
1957 + | ) -> Union[tuple, MMGroundingDinoModelOutput]:
1955 1958 | r"""
1956 1959 | input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`):
1957 1960 | Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide

@@ -2431,6 +2434,7 @@ class MMGroundingDinoForObjectDetection(MMGroundingDinoPreTrainedModel):
2431 2434 | output_hidden_states: Optional[bool] = None,
2432 2435 | return_dict: Optional[bool] = None,
2433 2436 | labels: Optional[list[dict[str, Union[torch.LongTensor, torch.FloatTensor]]]] = None,
2437 + | **kwargs,
2434 2438 | ):
2435 2439 | r"""
2436 2440 | input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`):

@@ -51,7 +51,7 @@ class MMGroundingDinoConfig(PreTrainedConfig):
51 51 | documentation from [`PreTrainedConfig`] for more information.
52 52 |
53 53 | Args:
54 - | backbone_config (`PreTrainedConfig
54 + | backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `SwinConfig()`):
55 55 | The configuration of the backbone model.
56 56 | backbone (`str`, *optional*):
57 57 | Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this

@@ -293,7 +293,6 @@ class MMGroundingDinoConfig(PreTrainedConfig):
293 293 | self.layer_norm_eps = layer_norm_eps
294 294 |
295 295 | super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
296 - | self.tie_encoder_decoder = True
297 296 |
298 297 |
299 298 | class MMGroundingDinoContrastiveEmbedding(GroundingDinoContrastiveEmbedding):

@@ -556,6 +556,8 @@ class MobileBertPreTrainedModel(PreTrainedModel):
556 556 | init.ones_(module.weight)
557 557 | elif isinstance(module, MobileBertLMPredictionHead):
558 558 | init.zeros_(module.bias)
559 + | elif isinstance(module, MobileBertEmbeddings):
560 + | init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
559 561 |
560 562 |
561 563 | @dataclass
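
The new `MobileBertEmbeddings` branch in `_init_weights` fills the `position_ids` buffer with consecutive indices. The value it writes, for an illustrative buffer length:

```python
import torch

max_positions = 8  # illustrative; the real buffer length comes from the config
position_ids = torch.arange(max_positions).expand((1, -1))
print(position_ids)  # tensor([[0, 1, 2, 3, 4, 5, 6, 7]])
```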
@@ -195,6 +195,7 @@ class MobileNetV1Model(MobileNetV1PreTrainedModel):
195 195 | pixel_values: Optional[torch.Tensor] = None,
196 196 | output_hidden_states: Optional[bool] = None,
197 197 | return_dict: Optional[bool] = None,
198 + | **kwargs,
198 199 | ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]:
199 200 | output_hidden_states = (
200 201 | output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states

@@ -260,6 +261,7 @@ class MobileNetV1ForImageClassification(MobileNetV1PreTrainedModel):
260 261 | output_hidden_states: Optional[bool] = None,
261 262 | labels: Optional[torch.Tensor] = None,
262 263 | return_dict: Optional[bool] = None,
264 + | **kwargs,
263 265 | ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
264 266 | r"""
265 267 | labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):

@@ -180,7 +180,6 @@ class MobileNetV2ImageProcessorFast(BaseImageProcessorFast):
180 180 | processed_images = reorder_images(processed_images_grouped, grouped_images_index)
181 181 |
182 182 | # Stack all processed images if return_tensors is specified
183 - | processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
184 183 |
185 184 | return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
186 185 |

@@ -331,6 +331,7 @@ class MobileNetV2Model(MobileNetV2PreTrainedModel):
331 331 | pixel_values: Optional[torch.Tensor] = None,
332 332 | output_hidden_states: Optional[bool] = None,
333 333 | return_dict: Optional[bool] = None,
334 + | **kwargs,
334 335 | ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]:
335 336 | output_hidden_states = (
336 337 | output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states

@@ -396,6 +397,7 @@ class MobileNetV2ForImageClassification(MobileNetV2PreTrainedModel):
396 397 | output_hidden_states: Optional[bool] = None,
397 398 | labels: Optional[torch.Tensor] = None,
398 399 | return_dict: Optional[bool] = None,
400 + | **kwargs,
399 401 | ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
400 402 | r"""
401 403 | labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):

@@ -524,6 +526,7 @@ class MobileNetV2ForSemanticSegmentation(MobileNetV2PreTrainedModel):
524 526 | labels: Optional[torch.Tensor] = None,
525 527 | output_hidden_states: Optional[bool] = None,
526 528 | return_dict: Optional[bool] = None,
529 + | **kwargs,
527 530 | ) -> Union[tuple, SemanticSegmenterOutput]:
528 531 | r"""
529 532 | labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):

@@ -79,7 +79,7 @@ class MobileViTImageProcessor(BaseImageProcessor):
79 79 | size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
80 80 | Controls the size of the output image after resizing. Can be overridden by the `size` parameter in the
81 81 | `preprocess` method.
82 - | resample (`PILImageResampling`, *optional*, defaults to `Resampling.
82 + | resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
83 83 | Defines the resampling filter to use if resizing the image. Can be overridden by the `resample` parameter
84 84 | in the `preprocess` method.
85 85 | do_rescale (`bool`, *optional*, defaults to `True`):

@@ -112,7 +112,7 @@ class MobileViTImageProcessor(BaseImageProcessor):
112 112 | self,
113 113 | do_resize: bool = True,
114 114 | size: Optional[dict[str, int]] = None,
115 - | resample: PILImageResampling = PILImageResampling.
115 + | resample: PILImageResampling = PILImageResampling.BICUBIC,
116 116 | do_rescale: bool = True,
117 117 | rescale_factor: Union[int, float] = 1 / 255,
118 118 | do_center_crop: bool = True,

@@ -137,12 +137,12 @@ class MobileViTImageProcessor(BaseImageProcessor):
137 137 | self.do_flip_channel_order = do_flip_channel_order
138 138 | self.do_reduce_labels = do_reduce_labels
139 139 |
140 - | # Copied from transformers.models.mobilenet_v1.image_processing_mobilenet_v1.MobileNetV1ImageProcessor.resize
140 + | # Copied from transformers.models.mobilenet_v1.image_processing_mobilenet_v1.MobileNetV1ImageProcessor.resize
141 141 | def resize(
142 142 | self,
143 143 | image: np.ndarray,
144 144 | size: dict[str, int],
145 - | resample: PILImageResampling = PILImageResampling.
145 + | resample: PILImageResampling = PILImageResampling.BICUBIC,
146 146 | data_format: Optional[Union[str, ChannelDimension]] = None,
147 147 | input_data_format: Optional[Union[str, ChannelDimension]] = None,
148 148 | **kwargs,

@@ -156,7 +156,7 @@ class MobileViTImageProcessor(BaseImageProcessor):
156 156 | Image to resize.
157 157 | size (`dict[str, int]`):
158 158 | Size of the output image.
159 - | resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.
159 + | resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
160 160 | Resampling filter to use when resiizing the image.
161 161 | data_format (`str` or `ChannelDimension`, *optional*):
162 162 | The channel dimension format of the image. If not provided, it will be the same as the input image.