transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +49 -3
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/cli/serve.py +47 -17
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +83 -7
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +374 -147
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +2 -3
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +55 -24
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +165 -124
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +228 -136
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +3 -14
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +16 -2
- transformers/integrations/accelerate.py +58 -113
- transformers/integrations/aqlm.py +36 -66
- transformers/integrations/awq.py +46 -515
- transformers/integrations/bitnet.py +47 -105
- transformers/integrations/bitsandbytes.py +91 -202
- transformers/integrations/deepspeed.py +18 -2
- transformers/integrations/eetq.py +84 -81
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +241 -208
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +37 -62
- transformers/integrations/hub_kernels.py +65 -8
- transformers/integrations/integration_utils.py +45 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +28 -74
- transformers/integrations/peft.py +12 -29
- transformers/integrations/quanto.py +77 -56
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +42 -90
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +40 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +74 -19
- transformers/modeling_rope_utils.py +107 -86
- transformers/modeling_utils.py +611 -527
- transformers/models/__init__.py +22 -0
- transformers/models/afmoe/modeling_afmoe.py +10 -19
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +14 -6
- transformers/models/altclip/modeling_altclip.py +11 -3
- transformers/models/apertus/modeling_apertus.py +8 -6
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +5 -5
- transformers/models/aria/modeling_aria.py +12 -8
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +38 -0
- transformers/models/auto/feature_extraction_auto.py +9 -3
- transformers/models/auto/image_processing_auto.py +5 -2
- transformers/models/auto/modeling_auto.py +37 -0
- transformers/models/auto/processing_auto.py +22 -10
- transformers/models/auto/tokenization_auto.py +147 -566
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +21 -21
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +11 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +14 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +9 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +16 -3
- transformers/models/bitnet/modeling_bitnet.py +5 -5
- transformers/models/blenderbot/modeling_blenderbot.py +12 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +10 -0
- transformers/models/blip_2/modeling_blip_2.py +4 -1
- transformers/models/bloom/modeling_bloom.py +17 -44
- transformers/models/blt/modeling_blt.py +164 -4
- transformers/models/blt/modular_blt.py +170 -5
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +11 -1
- transformers/models/bros/modeling_bros.py +12 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +11 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +11 -5
- transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +30 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +9 -0
- transformers/models/clvp/modeling_clvp.py +19 -3
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +5 -4
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +8 -7
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
- transformers/models/convbert/modeling_convbert.py +9 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +7 -4
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +15 -2
- transformers/models/cvt/modeling_cvt.py +7 -1
- transformers/models/cwm/modeling_cwm.py +5 -5
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +48 -39
- transformers/models/d_fine/modular_d_fine.py +16 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +5 -1
- transformers/models/dac/modeling_dac.py +6 -6
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +3 -3
- transformers/models/deberta/modeling_deberta.py +7 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
- transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +13 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +16 -4
- transformers/models/dia/modular_dia.py +11 -1
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +5 -5
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +3 -4
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +18 -12
- transformers/models/dots1/modeling_dots1.py +23 -11
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +6 -3
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +7 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +12 -6
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +60 -16
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +11 -5
- transformers/models/evolla/modeling_evolla.py +13 -5
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +3 -3
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +9 -4
- transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
- transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
- transformers/models/fast_vlm/__init__.py +27 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +21 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +10 -2
- transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
- transformers/models/florence2/modeling_florence2.py +22 -4
- transformers/models/florence2/modular_florence2.py +15 -1
- transformers/models/fnet/modeling_fnet.py +14 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +19 -3
- transformers/models/gemma/modeling_gemma.py +14 -16
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +5 -5
- transformers/models/gemma2/modular_gemma2.py +3 -2
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +42 -91
- transformers/models/gemma3/modular_gemma3.py +38 -87
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +65 -218
- transformers/models/gemma3n/modular_gemma3n.py +68 -68
- transformers/models/git/modeling_git.py +183 -126
- transformers/models/glm/modeling_glm.py +5 -5
- transformers/models/glm4/modeling_glm4.py +5 -5
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +18 -8
- transformers/models/glm4v/modular_glm4v.py +17 -7
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
- transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +13 -6
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
- transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
- transformers/models/gptj/modeling_gptj.py +18 -6
- transformers/models/granite/modeling_granite.py +5 -5
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +6 -9
- transformers/models/granitemoe/modular_granitemoe.py +1 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
- transformers/models/groupvit/modeling_groupvit.py +9 -1
- transformers/models/helium/modeling_helium.py +5 -4
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +7 -0
- transformers/models/hubert/modular_hubert.py +5 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +22 -0
- transformers/models/idefics/modeling_idefics.py +15 -21
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +11 -3
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +13 -12
- transformers/models/internvl/modular_internvl.py +7 -13
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +25 -20
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +16 -7
- transformers/models/janus/modular_janus.py +17 -7
- transformers/models/jetmoe/modeling_jetmoe.py +4 -4
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +15 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +248 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +730 -0
- transformers/models/lasr/modular_lasr.py +576 -0
- transformers/models/lasr/processing_lasr.py +94 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +10 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +12 -0
- transformers/models/levit/modeling_levit.py +21 -0
- transformers/models/lfm2/modeling_lfm2.py +5 -6
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +23 -15
- transformers/models/llama/modeling_llama.py +5 -5
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +11 -6
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
- transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -4
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +14 -0
- transformers/models/mamba/modeling_mamba.py +16 -23
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +8 -0
- transformers/models/markuplm/modeling_markuplm.py +9 -8
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +11 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +11 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +14 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +28 -5
- transformers/models/minimax/modeling_minimax.py +19 -6
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +5 -5
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +5 -4
- transformers/models/mistral/modeling_mistral.py +5 -4
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +15 -7
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +15 -4
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +7 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
- transformers/models/modernbert/modeling_modernbert.py +16 -2
- transformers/models/modernbert/modular_modernbert.py +14 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
- transformers/models/moonshine/modeling_moonshine.py +5 -3
- transformers/models/moshi/modeling_moshi.py +26 -53
- transformers/models/mpnet/modeling_mpnet.py +7 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +10 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +7 -10
- transformers/models/musicgen/modeling_musicgen.py +7 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
- transformers/models/mvp/modeling_mvp.py +14 -0
- transformers/models/nanochat/modeling_nanochat.py +5 -5
- transformers/models/nemotron/modeling_nemotron.py +7 -5
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +15 -68
- transformers/models/nystromformer/modeling_nystromformer.py +13 -0
- transformers/models/olmo/modeling_olmo.py +5 -5
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +5 -6
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +5 -5
- transformers/models/olmoe/modeling_olmoe.py +15 -7
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +11 -39
- transformers/models/openai/modeling_openai.py +15 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +11 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +11 -3
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +14 -6
- transformers/models/parakeet/modular_parakeet.py +7 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
- transformers/models/patchtst/modeling_patchtst.py +25 -6
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +8 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +13 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +3 -2
- transformers/models/phi/modeling_phi.py +5 -6
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +3 -2
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +15 -7
- transformers/models/phimoe/modular_phimoe.py +3 -3
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +3 -2
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +13 -0
- transformers/models/plbart/modular_plbart.py +8 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +13 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +5 -1
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +5 -5
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
- transformers/models/qwen3/modeling_qwen3.py +5 -5
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
- transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
- transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +8 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
- transformers/models/reformer/modeling_reformer.py +13 -1
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +10 -1
- transformers/models/rembert/modeling_rembert.py +13 -1
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +19 -5
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +6 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +2 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +7 -3
- transformers/models/sam2/modular_sam2.py +7 -3
- transformers/models/sam2_video/modeling_sam2_video.py +52 -43
- transformers/models/sam2_video/modular_sam2_video.py +32 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +100 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +4 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
- transformers/models/seed_oss/modeling_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +6 -3
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +67 -41
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +5 -5
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
- transformers/models/speecht5/modeling_speecht5.py +41 -1
- transformers/models/splinter/modeling_splinter.py +12 -3
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +8 -0
- transformers/models/stablelm/modeling_stablelm.py +4 -2
- transformers/models/starcoder2/modeling_starcoder2.py +5 -4
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +6 -0
- transformers/models/swin/modeling_swin.py +20 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +51 -33
- transformers/models/swinv2/modeling_swinv2.py +45 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +8 -7
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +6 -6
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +5 -1
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +14 -0
- transformers/models/timesfm/modular_timesfm.py +14 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
- transformers/models/trocr/modeling_trocr.py +3 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +6 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +7 -7
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +7 -6
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +13 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +8 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +5 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +11 -3
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +5 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +11 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +18 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +10 -1
- transformers/models/zamba/modeling_zamba.py +4 -1
- transformers/models/zamba2/modeling_zamba2.py +7 -4
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +8 -0
- transformers/pipelines/__init__.py +11 -9
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +2 -10
- transformers/pipelines/document_question_answering.py +4 -2
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +133 -50
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +44 -174
- transformers/quantizers/quantizer_aqlm.py +2 -23
- transformers/quantizers/quantizer_auto_round.py +2 -12
- transformers/quantizers/quantizer_awq.py +20 -89
- transformers/quantizers/quantizer_bitnet.py +4 -14
- transformers/quantizers/quantizer_bnb_4bit.py +18 -155
- transformers/quantizers/quantizer_bnb_8bit.py +24 -110
- transformers/quantizers/quantizer_compressed_tensors.py +2 -9
- transformers/quantizers/quantizer_eetq.py +16 -74
- transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
- transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
- transformers/quantizers/quantizer_fp_quant.py +52 -82
- transformers/quantizers/quantizer_gptq.py +8 -28
- transformers/quantizers/quantizer_higgs.py +42 -60
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +14 -194
- transformers/quantizers/quantizer_quanto.py +35 -79
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +4 -12
- transformers/quantizers/quantizer_torchao.py +50 -325
- transformers/quantizers/quantizer_vptq.py +4 -27
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +324 -47
- transformers/tokenization_mistral_common.py +7 -2
- transformers/tokenization_utils_base.py +116 -224
- transformers/tokenization_utils_tokenizers.py +190 -106
- transformers/trainer.py +51 -32
- transformers/trainer_callback.py +8 -0
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +74 -38
- transformers/utils/__init__.py +7 -4
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +35 -25
- transformers/utils/generic.py +47 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +112 -25
- transformers/utils/kernel_config.py +74 -19
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +78 -245
- transformers/video_processing_utils.py +17 -14
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/codegen/tokenization_codegen.py:

```diff
@@ -67,6 +67,10 @@ class CodeGenTokenizer(TokenizersBackend):
     refer to this superclass for more information regarding those methods.
 
     Args:
+        vocab (`str` or `dict[str, int]`, *optional*):
+            Custom vocabulary dictionary. If not provided, vocabulary is loaded from `vocab_file`.
+        merges (`str` or `list[str]`, *optional*):
+            Custom merges list. If not provided, merges are loaded from `merges_file`.
         unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
@@ -79,31 +83,24 @@ class CodeGenTokenizer(TokenizersBackend):
         add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
             other word. (CodeGen tokenizer detect beginning of words by the preceding space).
-        add_bos_token (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial beginning of sentence token to the input.
         return_token_type_ids (`bool`, *optional*, defaults to `False`):
             Whether to return token type IDs.
-        vocab (`dict`, *optional*):
-            Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
-        merges (`list`, *optional*):
-            Custom merges list. If not provided, merges are loaded from merges_file.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
-
+    model = BPE
 
     def __init__(
         self,
-        unk_token="<|endoftext|>",
-        bos_token="<|endoftext|>",
-        eos_token="<|endoftext|>",
+        vocab: Optional[Union[str, dict[str, int]]] = None,
+        merges: Optional[Union[str, list[str]]] = None,
+        unk_token: str = "<|endoftext|>",
+        bos_token: str = "<|endoftext|>",
+        eos_token: str = "<|endoftext|>",
         pad_token=None,
-        add_prefix_space=False,
-        add_bos_token=False,
-        return_token_type_ids=False,
-        vocab: Optional[dict] = None,
-        merges: Optional[list] = None,
+        add_prefix_space: bool = False,
+        return_token_type_ids: bool = False,
         **kwargs,
     ):
         self.return_token_type_ids = return_token_type_ids
@@ -112,17 +109,8 @@ class CodeGenTokenizer(TokenizersBackend):
 
         self.add_prefix_space = add_prefix_space
 
-        if vocab is not None:
-            self._vocab = (
-                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
-            )
-        else:
-            self._vocab = {}
-
-        if merges is not None:
-            self._merges = merges
-        else:
-            self._merges = []
+        self._vocab = vocab if vocab is not None else {}
+        self._merges = merges or []
 
         self._tokenizer = Tokenizer(
             BPE(
@@ -141,33 +129,16 @@ class CodeGenTokenizer(TokenizersBackend):
             add_prefix_space=True, use_regex=True, trim_offsets=False
         )
 
-        tokenizer_object = self._tokenizer
-
-        # Set these before calling super().__init__() so the base class _post_init() can use them
-        self._add_bos_token = add_bos_token
-        self._add_eos_token = False
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             unk_token=unk_token,
             bos_token=bos_token,
            eos_token=eos_token,
             pad_token=pad_token,
             add_prefix_space=add_prefix_space,
-            add_bos_token=add_bos_token,
             return_token_type_ids=return_token_type_ids,
             **kwargs,
         )
 
-        self._post_init()
-
-    def _post_init(self):
-        self._tokenizer.post_processor = processors.ByteLevel(
-            add_prefix_space=True, use_regex=True, trim_offsets=False
-        )
-        # Ensure base class post-init runs to register special/extra tokens, etc.
-        super()._post_init()
-
     def decode(
         self,
         token_ids: Union[int, list[int], np.ndarray, "torch.Tensor"],
```
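
The rc2 signature moves `vocab` and `merges` to the front of `__init__` with explicit types, declares the backing tokenizer model via a new `model = BPE` class attribute, and drops `add_bos_token` along with the `_post_init` override (post-initialization is apparently now handled by the `TokenizersBackend` base class). A minimal usage sketch under the new signature; the toy vocabulary below is illustrative, not real CodeGen data:

```python
from transformers import CodeGenTokenizer

# Toy in-memory vocabulary and merges; real CodeGen checkpoints ship a far
# larger BPE vocabulary. "Ġ" marks a leading space in byte-level BPE.
vocab = {"<|endoftext|>": 0, "def": 1, "Ġfoo": 2, "(": 3, ")": 4, ":": 5}
merges = []  # no merge rules needed for this toy vocabulary

tok = CodeGenTokenizer(vocab=vocab, merges=merges)
print(tok.convert_tokens_to_ids(["def", "("]))  # [1, 3]
```

Callers that previously passed `add_bos_token` will need to drop it.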
transformers/models/cohere/modeling_cohere.py:

```diff
@@ -36,6 +36,7 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
+from ...integrations import use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
@@ -44,7 +45,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_cohere import CohereConfig
 
 
@@ -82,7 +83,7 @@ class CohereRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
```
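
In the last hunk above, `original_inv_freq` changes from a plain tensor attribute to a registered non-persistent buffer. Buffers follow the module through `Module.to()` device and dtype moves, while `persistent=False` keeps them out of `state_dict()`; a plain attribute gets neither behavior. A standalone PyTorch sketch of the difference (names illustrative):

```python
import torch
from torch import nn

class Demo(nn.Module):
    def __init__(self):
        super().__init__()
        inv_freq = torch.arange(1.0, 5.0)
        self.plain = inv_freq.clone()  # plain attribute: ignored by Module.to()
        self.register_buffer("buffered", inv_freq.clone(), persistent=False)

m = Demo().to(torch.float64)
print(m.plain.dtype, m.buffered.dtype)  # torch.float32 torch.float64
print("buffered" in m.state_dict())     # False: non-persistent, skipped in checkpoints
```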
transformers/models/cohere/modeling_cohere.py (continued):

```diff
@@ -121,7 +122,7 @@ class CohereRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.repeat_interleave(freqs, 2, dim=-1)  # diff from Llama: we interleave() instead of cat()
             cos = emb.cos() * self.attention_scaling
```
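
`maybe_autocast` comes from the updated `transformers.utils.generic` import above and is called here exactly like `torch.autocast(device_type=..., enabled=False)` to keep the RoPE frequency math in float32. The diff does not show its definition, so the following is only a hypothetical sketch, assuming it degrades to a no-op context where autocast does not apply:

```python
import contextlib
import torch

def maybe_autocast(device_type: str, enabled: bool = True):
    # Hypothetical: fall back to a no-op context manager when torch.autocast
    # rejects the device type. The real helper in utils/generic.py may differ.
    try:
        return torch.autocast(device_type=device_type, enabled=enabled)
    except RuntimeError:
        return contextlib.nullcontext()
```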
@@ -222,6 +223,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype)
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class CohereAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
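`use_kernelized_func` is new in `transformers.integrations` (see `hub_kernels.py` in the file list); combined with the removal of `self.rotary_fn = apply_rotary_pos_emb` in the next hunk, it evidently attaches the rotary function to the class so a faster kernel can be substituted later. A hedged sketch of that dispatch pattern, with illustrative names only, not the actual transformers implementation:

```python
# Keep a reference implementation on the class; a kernel registry could
# overwrite `cls.rotary_fn` with a fused version at load time.
def use_kernelized_func(reference_fn):
    def decorator(cls):
        cls.rotary_fn = staticmethod(reference_fn)
        return cls
    return decorator

def apply_rotary_ref(q, k):  # stands in for apply_rotary_pos_emb
    return q, k

@use_kernelized_func(apply_rotary_ref)
class Attention:
    def forward(self, q, k):
        return self.rotary_fn(q, k)  # reference by default, kernel if swapped in

print(Attention().forward(1, 2))  # (1, 2)
```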
@@ -247,7 +249,6 @@ class CohereAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         self.use_qk_norm = config.use_qk_norm
         if self.use_qk_norm:
             # When sharding the model using Tensor Parallelism, need to be careful to use n_local_heads

@@ -36,6 +36,7 @@ from ...modeling_rope_utils import dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, logging
+from ...utils.generic import maybe_autocast
 from ..llama.modeling_llama import (
     LlamaAttention,
     LlamaForCausalLM,

@@ -75,7 +76,7 @@ class CohereRotaryEmbedding(LlamaRotaryEmbedding):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.repeat_interleave(freqs, 2, dim=-1)  # diff from Llama: we interleave() instead of cat()
             cos = emb.cos() * self.attention_scaling

@@ -100,21 +100,23 @@ class CohereTokenizer(TokenizersBackend):
             Whether or not the default system prompt for Cohere tokenizer should be used.
         add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not the tokenizer should automatically add a prefix space
-        vocab (`dict`, *optional*):
+        vocab (`str`, `dict` or `list`, *optional*):
             Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
-        merges (`list`, *optional*):
-            Custom merges list. If not provided, merges are loaded from merges_file
+        merges (`str` or `list[str]`, *optional*):
+            Custom merges list. If not provided, merges are loaded from `merges_file`.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     padding_side = "left"
     model_input_names = ["input_ids", "attention_mask"]
-
+    model = BPE
     # No `max_model_input_sizes`
 
     def __init__(
         self,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
+        merges: Optional[Union[str, list[str]]] = None,
         errors: str = "replace",
         unk_token: str = "<UNK>",
         bos_token: str = "<BOS_TOKEN>",

@@ -123,27 +125,19 @@ class CohereTokenizer(TokenizersBackend):
         cls_token: str = "<CLS>",
         sep_token: str = "<SEP>",
         mask_token: str = "<MASK_TOKEN>",
-        add_bos_token: bool = True,
-        add_eos_token: bool = False,
         use_default_system_prompt: bool = False,
         add_prefix_space: bool = False,
-        vocab: Optional[dict] = None,
-        merges: Optional[list] = None,
         **kwargs,
     ):
-        self._add_bos_token = add_bos_token
-        self._add_eos_token = add_eos_token
         self.use_default_system_prompt = use_default_system_prompt
         self.add_prefix_space = add_prefix_space
         self.grounded_generation_template = kwargs.pop("grounded_generation_template", None)
         self.tool_use_template = kwargs.pop("tool_use_template", None)
 
-
-        if vocab is not None:
-            self._vocab = vocab
-
-        else:
-            self._vocab = {
+        self._vocab = (
+            vocab
+            if vocab is not None
+            else {
                 str(pad_token): 0,
                 str(unk_token): 1,
                 str(cls_token): 2,

@@ -151,12 +145,9 @@ class CohereTokenizer(TokenizersBackend):
                 str(mask_token): 4,
                 str(bos_token): 5,
             }
+        )
 
-        if merges is not None:
-            self._merges = merges
-        else:
-            self._merges = []
-
+        self._merges = merges or []
         self._tokenizer = Tokenizer(
             BPE(
                 vocab=self._vocab,
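The tokenizer refactor above replaces if/else blocks with expression-level defaulting (`vocab if vocab is not None else {...}`, `merges or []`) and moves `vocab`/`merges` to the front of the signature; note the removed if/else lines were partially lost in extraction and are reconstructed approximately. The same pattern in miniature, with invented special tokens (not Cohere's real ids):

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

def build(vocab=None, merges=None, pad="<PAD>", unk="<UNK>"):
    # Expression-level defaulting, as in the refactored __init__:
    vocab = vocab if vocab is not None else {pad: 0, unk: 1}
    merges = merges or []  # same shortcut the diff introduces
    return Tokenizer(BPE(vocab=vocab, merges=merges, unk_token=unk))

tok = build()
print(tok.encode("?").tokens)  # unknown chars map to <UNK>
```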
@@ -177,10 +168,7 @@ class CohereTokenizer(TokenizersBackend):
         )
         self._tokenizer.decoder = decoders.ByteLevel(add_prefix_space=add_prefix_space, trim_offsets=True)
 
-        tokenizer_object = self._tokenizer
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             errors=errors,
             unk_token=unk_token,
             bos_token=bos_token,

@@ -189,8 +177,6 @@ class CohereTokenizer(TokenizersBackend):
             cls_token=cls_token,
             sep_token=sep_token,
             mask_token=mask_token,
-            add_bos_token=add_bos_token,
-            add_eos_token=add_eos_token,
             use_default_system_prompt=use_default_system_prompt,
             add_prefix_space=add_prefix_space,
             **kwargs,

@@ -198,22 +184,6 @@ class CohereTokenizer(TokenizersBackend):
 
         self._post_init()
 
-    def _post_init(self):
-        """Post-initialization to ensure add_prefix_space is applied correctly."""
-        # Re-apply add_prefix_space setting to pre_tokenizer and decoder
-        # This is needed because when loading from pretrained, the tokenizer.json
-        # has these settings baked in and we need to override them
-        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
-            [
-                pre_tokenizers.Digits(individual_digits=True),
-                pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space, trim_offsets=True),
-            ]
-        )
-        self._tokenizer.decoder = decoders.ByteLevel(add_prefix_space=self.add_prefix_space, trim_offsets=True)
-
-        # Call parent to handle AddedToken properties
-        super()._post_init()
-
     def apply_tool_use_template(
         self,
         conversation: list[dict[str, str]],

@@ -28,15 +28,15 @@ import torch.nn as nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
+from ...integrations import use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
-from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_cohere2 import Cohere2Config
 
 

@@ -57,7 +57,7 @@ class Cohere2RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(

@@ -96,7 +96,7 @@ class Cohere2RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.repeat_interleave(freqs, 2, dim=-1)  # diff from Llama: we interleave() instead of cat()
             cos = emb.cos() * self.attention_scaling

@@ -198,6 +198,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype)
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class Cohere2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 

@@ -233,7 +234,7 @@ class Cohere2Attention(nn.Module):
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         input_shape = hidden_states.shape[:-1]
         hidden_shape = (*input_shape, -1, self.head_dim)
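Several signatures here swap the old kwargs annotation (reconstructed as `Unpack[FlashAttentionKwargs]`, matching the removed import) for `Unpack[TransformersKwargs]`, giving type-checkers visibility into every accepted keyword. A self-contained sketch with a toy TypedDict (the real `TransformersKwargs` lives in `transformers.utils`):

```python
from typing_extensions import TypedDict, Unpack

class ToyKwargs(TypedDict, total=False):
    output_attentions: bool
    output_hidden_states: bool

def forward(**kwargs: Unpack[ToyKwargs]) -> None:
    # A type-checker flags e.g. forward(output_attention=True) as a typo.
    print(kwargs)

forward(output_attentions=True)
```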
@@ -304,7 +305,7 @@ class Cohere2DecoderLayer(GradientCheckpointingLayer):
         past_key_values: Optional[Cache] = None,
         use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
         """
         Args:

@@ -398,7 +399,7 @@ class Cohere2Model(Cohere2PreTrainedModel):
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
-        if use_cache and past_key_values is None and not self.training:
+        if use_cache and past_key_values is None:
             past_key_values = DynamicCache(config=self.config)
 
         if cache_position is None:
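The cache-initialization guard loses its training-mode condition (the removed line was truncated in extraction; `... and not self.training` is reconstructed from earlier releases): a `DynamicCache` is now created whenever `use_cache=True` and no cache was passed, in training or not. A stub sketch of the before/after logic:

```python
class DynamicCacheStub:
    """Stand-in for transformers' DynamicCache, for illustration only."""
    def __init__(self, config=None):
        self.layers = []

def maybe_init_cache(use_cache, past_key_values, config, training):
    # rc0 (reconstructed): `if use_cache and past_key_values is None and not training:`
    # rc2: the training guard is gone.
    if use_cache and past_key_values is None:
        past_key_values = DynamicCacheStub(config=config)
    return past_key_values

# Under the old semantics this returned None in training mode; now a cache is made:
print(maybe_init_cache(True, None, config=None, training=True))
```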
@@ -22,7 +22,6 @@ import torch.nn as nn
 from ...cache_utils import Cache, DynamicCache
 from ...configuration_utils import PreTrainedConfig, layer_type_validation
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
-from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import BaseModelOutputWithPast
 from ...modeling_rope_utils import (
     RopeParameters,

@@ -31,6 +30,7 @@ from ...modeling_rope_utils import (
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, logging
+from ...utils.generic import maybe_autocast
 from ..cohere.modeling_cohere import (
     CohereAttention,
     CohereDecoderLayer,

@@ -223,7 +223,7 @@ class Cohere2RotaryEmbedding(CohereRotaryEmbedding):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.repeat_interleave(freqs, 2, dim=-1)  # diff from Llama: we interleave() instead of cat()
             cos = emb.cos() * self.attention_scaling

@@ -271,7 +271,7 @@ class Cohere2Attention(CohereAttention):
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         input_shape = hidden_states.shape[:-1]
         hidden_shape = (*input_shape, -1, self.head_dim)

@@ -322,7 +322,7 @@ class Cohere2DecoderLayer(CohereDecoderLayer):
         past_key_values: Optional[Cache] = None,
         use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)

@@ -367,7 +367,7 @@ class Cohere2Model(Gemma2Model):
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
-        if use_cache and past_key_values is None and not self.training:
+        if use_cache and past_key_values is None:
             past_key_values = DynamicCache(config=self.config)
 
         if cache_position is None:

@@ -93,8 +93,9 @@ def get_optimal_tiled_canvas(
     patch_size_height, patch_size_width = target_tile_size  # (height == width)
 
     candidate_resolutions = np.array(possible_resolutions) * patch_size_height
-    original_size = np.stack([image_height, image_width])
-    required_scales = candidate_resolutions / original_size
+    # tiles following (width, height) order to align with aspect ratio convention
+    tile_size = np.stack([image_width, image_height])
+    required_scales = candidate_resolutions / tile_size
     required_scale = np.min(required_scales, axis=-1, keepdims=True)  # [n_resolutions, 1]
     if np.all(required_scale < 1):
         # We are forced to downscale, so try to minimize the amount of downscaling

@@ -103,7 +104,7 @@ def get_optimal_tiled_canvas(
     # Pick the resolution that required the least upscaling so that it most closely fits the image
     required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
     best_grid = possible_resolutions[np.argmin(required_scale)]
-    return best_grid
+    return best_grid  # (width, height)
 
 
 @auto_docstring
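The `get_optimal_tiled_canvas` fix compares candidate canvases against the tile size in (width, height) order; note the removed lines above are reconstructed only approximately (the old variable names were lost in extraction). A worked toy example of the corrected selection:

```python
import numpy as np

possible_resolutions = np.array([[1, 1], [1, 2], [2, 1]])  # tile grids, (w, h)
patch_size = 512
image_width, image_height = 1024, 512

candidate_resolutions = possible_resolutions * patch_size
tile_size = np.stack([image_width, image_height])  # (width, height) order
required_scales = candidate_resolutions / tile_size
required_scale = np.min(required_scales, axis=-1, keepdims=True)
# Pick the grid needing the least upscaling; scales < 1 are masked out,
# matching the upscaling branch of the original function:
required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
best_grid = possible_resolutions[np.argmin(required_scale)]
print(best_grid)  # [2 1]: a 2x1 (w x h) grid fits a 1024x512 image
```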
@@ -262,7 +263,6 @@ class Cohere2VisionImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
 
         return BatchFeature(
             data={"pixel_values": processed_images, "num_patches": num_patches}, tensor_type=return_tensors
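Dropping the explicit `torch.stack` leans on `BatchFeature`'s own tensor conversion when `tensor_type` is set, plausibly enabled by this release's `feature_extraction_utils.py` changes (not verified here). A known-safe demonstration with numpy inputs:

```python
# BatchFeature converts list data to tensors once tensor_type is given.
# The fast image processors pass lists of torch tensors; numpy is used here
# because that conversion path is stable across releases.
import numpy as np
from transformers import BatchFeature

images = [np.zeros((3, 2, 2)), np.zeros((3, 2, 2))]
feat = BatchFeature(data={"pixel_values": images}, tensor_type="pt")
print(type(feat["pixel_values"]), feat["pixel_values"].shape)  # Tensor (2, 3, 2, 2)
```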
@@ -376,6 +376,7 @@ class Cohere2VisionForConditionalGeneration(Cohere2VisionPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

@@ -387,12 +388,15 @@ class Cohere2VisionForConditionalGeneration(Cohere2VisionPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
 
         return model_inputs
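The gating condition moves from a prefill check (reconstructed above as `cache_position[0] == 0`; the removed lines were truncated in extraction) to an explicit `is_first_iteration` flag threaded through `prepare_inputs_for_generation`, which also covers continuing generation from a cached prefix. A stub showing the new behavior:

```python
# Pixel values are forwarded only on the first generate() iteration (or when
# caching is off); afterwards the image features already live in the KV cache.
# Stand-alone stub, not the actual model code.
def prepare_pixel_values(pixel_values, is_first_iteration, **kwargs):
    model_inputs = {}
    if is_first_iteration or not kwargs.get("use_cache", True):
        model_inputs["pixel_values"] = pixel_values
    return model_inputs

print(prepare_pixel_values("PIXELS", is_first_iteration=True))   # forwarded
print(prepare_pixel_values("PIXELS", is_first_iteration=False))  # dropped
```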
@@ -295,8 +295,9 @@ def get_optimal_tiled_canvas(
     patch_size_height, patch_size_width = target_tile_size  # (height == width)
 
     candidate_resolutions = np.array(possible_resolutions) * patch_size_height
-    original_size = np.stack([image_height, image_width])
-    required_scales = candidate_resolutions / original_size
+    # tiles following (width, height) order to align with aspect ratio convention
+    tile_size = np.stack([image_width, image_height])
+    required_scales = candidate_resolutions / tile_size
     required_scale = np.min(required_scales, axis=-1, keepdims=True)  # [n_resolutions, 1]
     if np.all(required_scale < 1):
         # We are forced to downscale, so try to minimize the amount of downscaling

@@ -305,7 +306,7 @@ def get_optimal_tiled_canvas(
     # Pick the resolution that required the least upscaling so that it most closely fits the image
     required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
     best_grid = possible_resolutions[np.argmin(required_scale)]
-    return best_grid
+    return best_grid  # (width, height)
 
 
 class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False):

@@ -141,6 +141,7 @@ class ColQwen2ForRetrieval(ColQwen2PreTrainedModel):
         pixel_values: Optional[torch.Tensor] = None,
         image_grid_thw: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> ColQwen2ForRetrievalOutput:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):

@@ -322,6 +322,7 @@ class ColQwen2ForRetrieval(ColPaliForRetrieval):
         pixel_values: Optional[torch.Tensor] = None,
         image_grid_thw: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> ColQwen2ForRetrievalOutput:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):

@@ -37,7 +37,7 @@ class ConditionalDetrConfig(PreTrainedConfig):
         use_timm_backbone (`bool`, *optional*, defaults to `True`):
             Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`]
             API.
-        backbone_config (`PreTrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`):
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `ResNetConfig()`):
             The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which
             case it will default to `ResNetConfig()`.
         num_channels (`int`, *optional*, defaults to 3):

@@ -984,7 +984,7 @@ class ConditionalDetrPreTrainedModel(PreTrainedModel):
         elif isinstance(module, ConditionalDetrLearnedPositionEmbedding):
             init.uniform_(module.row_embeddings.weight)
             init.uniform_(module.column_embeddings.weight)
-        if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
             init.normal_(module.weight, mean=0.0, std=std)
             if module.bias is not None:
                 init.zeros_(module.bias)

@@ -993,6 +993,9 @@ class ConditionalDetrPreTrainedModel(PreTrainedModel):
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            init.ones_(module.weight)
+            init.zeros_(module.bias)
 
 
 # Copied from transformers.models.detr.modeling_detr.DetrEncoder with Detr->ConditionalDetr,DETR->ConditionalDETR
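The DETR-family `_init_weights` previously lumped `nn.BatchNorm2d` in with linear/conv layers (the truncated removed line is reconstructed above); it now handles `nn.LayerNorm`/`nn.GroupNorm` explicitly with an identity init. The same effect with plain `torch.nn.init` (the `init` in the hunk is transformers' own new `initialization` wrapper, per the file list):

```python
import torch.nn as nn
from torch.nn import init

def init_norm(module: nn.Module) -> None:
    # Identity init: weight = 1, bias = 0, as in the added branch.
    if isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
        init.ones_(module.weight)
        init.zeros_(module.bias)

layer = nn.LayerNorm(8)
init_norm(layer)
print(layer.weight.sum().item(), layer.bias.sum().item())  # 8.0 0.0
```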
@@ -1032,6 +1035,7 @@ class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         Args:

@@ -1156,6 +1160,7 @@ class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         Args:

@@ -1344,6 +1349,7 @@ class ConditionalDetrModel(ConditionalDetrPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], ConditionalDetrModelOutput]:
         r"""
         decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):

@@ -1529,6 +1535,7 @@ class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], ConditionalDetrObjectDetectionOutput]:
         r"""
         decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):

@@ -1693,6 +1700,7 @@ class ConditionalDetrForSegmentation(ConditionalDetrPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], ConditionalDetrSegmentationOutput]:
         r"""
         decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):

@@ -118,6 +118,9 @@ class ConvBertPreTrainedModel(PreTrainedModel):
         elif isinstance(module, GroupedLinearLayer):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             init.zeros_(module.bias)
+        elif isinstance(module, ConvBertEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
 
 class SeparableConv1D(nn.Module):
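`ConvBertEmbeddings` gains explicit re-initialization of its non-parameter buffers (`position_ids`, `token_type_ids`). Equivalent effect with plain torch on a toy module standing in for the real embeddings class:

```python
import torch
from torch import nn

class ToyEmbeddings(nn.Module):
    """Stand-in for ConvBertEmbeddings: only the two buffers in question."""
    def __init__(self, max_positions: int = 16):
        super().__init__()
        self.register_buffer("position_ids", torch.empty(1, max_positions, dtype=torch.long))
        self.register_buffer("token_type_ids", torch.empty(1, max_positions, dtype=torch.long))

emb = ToyEmbeddings()
with torch.no_grad():
    # Same values the added branch writes via transformers' init helpers:
    emb.position_ids.copy_(torch.arange(emb.position_ids.shape[-1]).expand((1, -1)))
    emb.token_type_ids.zero_()
print(emb.position_ids[0, :4].tolist())  # [0, 1, 2, 3]
```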
@@ -629,6 +632,7 @@ class ConvBertModel(ConvBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithCrossAttentions]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (

@@ -729,6 +733,7 @@ class ConvBertForMaskedLM(ConvBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MaskedLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

@@ -824,6 +829,7 @@ class ConvBertForSequenceClassification(ConvBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):

@@ -906,6 +912,7 @@ class ConvBertForMultipleChoice(ConvBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MultipleChoiceModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):

@@ -1013,6 +1020,7 @@ class ConvBertForTokenClassification(ConvBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):

@@ -1078,6 +1086,7 @@ class ConvBertForQuestionAnswering(ConvBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 

@@ -78,7 +78,7 @@ class ConvNextImageProcessor(BaseImageProcessor):
         crop_pct (`float` *optional*, defaults to 224 / 256):
             Percentage of the image to crop. Only has an effect if `do_resize` is `True` and size < 384. Can be
             overridden by `crop_pct` in the `preprocess` method.
-        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
             Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
         do_rescale (`bool`, *optional*, defaults to `True`):
             Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in

@@ -105,7 +105,7 @@ class ConvNextImageProcessor(BaseImageProcessor):
         do_resize: bool = True,
         size: Optional[dict[str, int]] = None,
         crop_pct: Optional[float] = None,
-        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
         do_rescale: bool = True,
         rescale_factor: Union[int, float] = 1 / 255,
         do_normalize: bool = True,
|