transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +30 -3
- transformers/cli/serve.py +47 -17
- transformers/conversion_mapping.py +15 -2
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +196 -135
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +1 -2
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +1 -2
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/configuration_utils.py +3 -2
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/continuous_api.py +134 -79
- transformers/image_processing_base.py +1 -2
- transformers/integrations/__init__.py +4 -2
- transformers/integrations/accelerate.py +15 -3
- transformers/integrations/aqlm.py +38 -66
- transformers/integrations/awq.py +48 -514
- transformers/integrations/bitnet.py +45 -100
- transformers/integrations/bitsandbytes.py +79 -191
- transformers/integrations/deepspeed.py +1 -0
- transformers/integrations/eetq.py +84 -79
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +236 -193
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +40 -62
- transformers/integrations/hub_kernels.py +42 -3
- transformers/integrations/integration_utils.py +10 -0
- transformers/integrations/mxfp4.py +25 -65
- transformers/integrations/peft.py +7 -29
- transformers/integrations/quanto.py +73 -55
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +44 -90
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +42 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +8 -0
- transformers/modeling_rope_utils.py +30 -6
- transformers/modeling_utils.py +116 -112
- transformers/models/__init__.py +3 -0
- transformers/models/afmoe/modeling_afmoe.py +4 -4
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +2 -0
- transformers/models/altclip/modeling_altclip.py +4 -0
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/modeling_aria.py +4 -4
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/auto/configuration_auto.py +11 -0
- transformers/models/auto/feature_extraction_auto.py +2 -0
- transformers/models/auto/image_processing_auto.py +1 -0
- transformers/models/auto/modeling_auto.py +6 -0
- transformers/models/auto/processing_auto.py +18 -10
- transformers/models/auto/tokenization_auto.py +74 -472
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/bamba/modeling_bamba.py +4 -3
- transformers/models/bark/modeling_bark.py +2 -0
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/big_bird/modeling_big_bird.py +6 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +11 -2
- transformers/models/bitnet/modeling_bitnet.py +4 -4
- transformers/models/blenderbot/modeling_blenderbot.py +5 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
- transformers/models/blip/modeling_blip_text.py +2 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -1
- transformers/models/bloom/modeling_bloom.py +4 -0
- transformers/models/blt/modeling_blt.py +2 -2
- transformers/models/blt/modular_blt.py +2 -2
- transformers/models/bridgetower/modeling_bridgetower.py +5 -1
- transformers/models/bros/modeling_bros.py +4 -0
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +5 -0
- transformers/models/chameleon/modeling_chameleon.py +2 -1
- transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
- transformers/models/clap/modeling_clap.py +5 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +5 -0
- transformers/models/clvp/modeling_clvp.py +5 -0
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +4 -3
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +7 -6
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
- transformers/models/convbert/modeling_convbert.py +6 -0
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/modeling_csm.py +4 -3
- transformers/models/ctrl/modeling_ctrl.py +1 -0
- transformers/models/cvt/modeling_cvt.py +2 -0
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/modeling_d_fine.py +2 -0
- transformers/models/d_fine/modular_d_fine.py +1 -0
- transformers/models/dab_detr/modeling_dab_detr.py +4 -0
- transformers/models/dac/modeling_dac.py +2 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/dbrx/modeling_dbrx.py +2 -2
- transformers/models/deberta/modeling_deberta.py +5 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
- transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
- transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
- transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/modeling_detr.py +5 -0
- transformers/models/dia/modeling_dia.py +4 -3
- transformers/models/dia/modular_dia.py +0 -1
- transformers/models/diffllama/modeling_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +2 -3
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +2 -0
- transformers/models/dots1/modeling_dots1.py +10 -7
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/edgetam/modeling_edgetam.py +1 -1
- transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
- transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
- transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
- transformers/models/efficientnet/modeling_efficientnet.py +2 -0
- transformers/models/emu3/modeling_emu3.py +4 -4
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +14 -2
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
- transformers/models/esm/modeling_esmfold.py +5 -4
- transformers/models/evolla/modeling_evolla.py +4 -4
- transformers/models/exaone4/modeling_exaone4.py +2 -2
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +6 -1
- transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
- transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
- transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
- transformers/models/flaubert/modeling_flaubert.py +7 -0
- transformers/models/flava/modeling_flava.py +6 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
- transformers/models/florence2/modeling_florence2.py +2 -1
- transformers/models/florence2/modular_florence2.py +2 -1
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/processing_fuyu.py +3 -3
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +2 -1
- transformers/models/gemma3/modeling_gemma3.py +14 -84
- transformers/models/gemma3/modular_gemma3.py +12 -81
- transformers/models/gemma3n/modeling_gemma3n.py +18 -209
- transformers/models/gemma3n/modular_gemma3n.py +17 -59
- transformers/models/git/modeling_git.py +2 -0
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/modeling_glm4v.py +3 -3
- transformers/models/glm4v/modular_glm4v.py +6 -4
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
- transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/gpt2/modeling_gpt2.py +5 -1
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
- transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
- transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
- transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
- transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
- transformers/models/gptj/modeling_gptj.py +3 -0
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granitemoe/modeling_granitemoe.py +4 -6
- transformers/models/granitemoe/modular_granitemoe.py +0 -2
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
- transformers/models/groupvit/modeling_groupvit.py +3 -0
- transformers/models/helium/modeling_helium.py +4 -3
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +3 -0
- transformers/models/hubert/modular_hubert.py +1 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
- transformers/models/ibert/modeling_ibert.py +6 -0
- transformers/models/idefics/modeling_idefics.py +5 -21
- transformers/models/imagegpt/modeling_imagegpt.py +2 -1
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/internvl/modeling_internvl.py +2 -4
- transformers/models/internvl/modular_internvl.py +2 -4
- transformers/models/jamba/modeling_jamba.py +2 -2
- transformers/models/janus/modeling_janus.py +1 -0
- transformers/models/janus/modular_janus.py +1 -0
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/kosmos2/modeling_kosmos2.py +1 -0
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +244 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +729 -0
- transformers/models/lasr/modular_lasr.py +569 -0
- transformers/models/lasr/processing_lasr.py +96 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +5 -0
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +3 -0
- transformers/models/lfm2/modeling_lfm2.py +4 -5
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +4 -0
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/modeling_llama4.py +3 -2
- transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
- transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -0
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +4 -0
- transformers/models/mamba/modeling_mamba.py +14 -22
- transformers/models/marian/modeling_marian.py +5 -0
- transformers/models/markuplm/modeling_markuplm.py +4 -0
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/modeling_mask2former.py +2 -0
- transformers/models/maskformer/modeling_maskformer.py +2 -0
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +3 -1
- transformers/models/minimax/modeling_minimax.py +4 -4
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +4 -3
- transformers/models/mistral/modeling_mistral.py +4 -3
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mllama/modeling_mllama.py +2 -2
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/modeling_mobilevit.py +3 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
- transformers/models/modernbert/modeling_modernbert.py +4 -1
- transformers/models/modernbert/modular_modernbert.py +2 -0
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
- transformers/models/moonshine/modeling_moonshine.py +4 -2
- transformers/models/moshi/modeling_moshi.py +5 -2
- transformers/models/mpnet/modeling_mpnet.py +5 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +6 -0
- transformers/models/mt5/modeling_mt5.py +7 -0
- transformers/models/musicgen/modeling_musicgen.py +2 -0
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nemotron/modeling_nemotron.py +4 -2
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nougat/tokenization_nougat.py +11 -59
- transformers/models/nystromformer/modeling_nystromformer.py +6 -0
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +4 -5
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
- transformers/models/oneformer/modeling_oneformer.py +4 -1
- transformers/models/openai/modeling_openai.py +3 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/owlv2/modeling_owlv2.py +4 -0
- transformers/models/owlvit/modeling_owlvit.py +4 -0
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +9 -6
- transformers/models/parakeet/modular_parakeet.py +2 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
- transformers/models/patchtst/modeling_patchtst.py +20 -2
- transformers/models/pegasus/modeling_pegasus.py +5 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
- transformers/models/perceiver/modeling_perceiver.py +8 -0
- transformers/models/persimmon/modeling_persimmon.py +2 -1
- transformers/models/phi/modeling_phi.py +4 -5
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +2 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
- transformers/models/phimoe/modeling_phimoe.py +4 -4
- transformers/models/phimoe/modular_phimoe.py +2 -2
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pixtral/modeling_pixtral.py +2 -1
- transformers/models/plbart/modeling_plbart.py +6 -0
- transformers/models/plbart/modular_plbart.py +2 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/modeling_poolformer.py +2 -0
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +3 -0
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
- transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
- transformers/models/rag/modeling_rag.py +1 -0
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
- transformers/models/reformer/modeling_reformer.py +4 -0
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +6 -1
- transformers/models/rembert/modeling_rembert.py +6 -0
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +11 -2
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/modeling_rt_detr.py +2 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
- transformers/models/rwkv/modeling_rwkv.py +1 -0
- transformers/models/sam2/modeling_sam2.py +2 -2
- transformers/models/sam2/modular_sam2.py +2 -2
- transformers/models/sam2_video/modeling_sam2_video.py +1 -0
- transformers/models/sam2_video/modular_sam2_video.py +1 -0
- transformers/models/sam3/modeling_sam3.py +77 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
- transformers/models/sam3_video/modeling_sam3_video.py +1 -0
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
- transformers/models/seed_oss/modeling_seed_oss.py +2 -2
- transformers/models/segformer/modeling_segformer.py +4 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/siglip2/modeling_siglip2.py +4 -0
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
- transformers/models/speecht5/modeling_speecht5.py +13 -1
- transformers/models/splinter/modeling_splinter.py +3 -0
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +6 -0
- transformers/models/stablelm/modeling_stablelm.py +3 -1
- transformers/models/starcoder2/modeling_starcoder2.py +4 -3
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +2 -0
- transformers/models/swin/modeling_swin.py +4 -0
- transformers/models/swin2sr/modeling_swin2sr.py +2 -0
- transformers/models/swinv2/modeling_swinv2.py +4 -0
- transformers/models/t5/modeling_t5.py +7 -0
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +5 -5
- transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
- transformers/models/table_transformer/modeling_table_transformer.py +4 -0
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +2 -0
- transformers/models/timesfm/modular_timesfm.py +2 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
- transformers/models/trocr/modeling_trocr.py +2 -0
- transformers/models/tvp/modeling_tvp.py +2 -0
- transformers/models/udop/modeling_udop.py +4 -0
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/modeling_umt5.py +7 -0
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/vilt/modeling_vilt.py +6 -0
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +6 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/modeling_vitmatte.py +1 -0
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/modeling_whisper.py +6 -0
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +3 -0
- transformers/models/xglm/modeling_xglm.py +1 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +5 -0
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/yoso/modeling_yoso.py +6 -0
- transformers/models/zamba/modeling_zamba.py +2 -0
- transformers/models/zamba2/modeling_zamba2.py +4 -2
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/modeling_zoedepth.py +1 -0
- transformers/pipelines/__init__.py +2 -3
- transformers/pipelines/base.py +1 -9
- transformers/pipelines/document_question_answering.py +3 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/processing_utils.py +23 -11
- transformers/quantizers/base.py +35 -110
- transformers/quantizers/quantizer_aqlm.py +1 -5
- transformers/quantizers/quantizer_auto_round.py +1 -2
- transformers/quantizers/quantizer_awq.py +17 -81
- transformers/quantizers/quantizer_bitnet.py +3 -8
- transformers/quantizers/quantizer_bnb_4bit.py +13 -110
- transformers/quantizers/quantizer_bnb_8bit.py +16 -92
- transformers/quantizers/quantizer_compressed_tensors.py +1 -5
- transformers/quantizers/quantizer_eetq.py +14 -62
- transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
- transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
- transformers/quantizers/quantizer_fp_quant.py +48 -78
- transformers/quantizers/quantizer_gptq.py +7 -24
- transformers/quantizers/quantizer_higgs.py +40 -54
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +13 -167
- transformers/quantizers/quantizer_quanto.py +20 -64
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +1 -4
- transformers/quantizers/quantizer_torchao.py +23 -202
- transformers/quantizers/quantizer_vptq.py +8 -22
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +297 -36
- transformers/tokenization_mistral_common.py +4 -0
- transformers/tokenization_utils_base.py +113 -222
- transformers/tokenization_utils_tokenizers.py +168 -107
- transformers/trainer.py +28 -31
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +66 -28
- transformers/utils/__init__.py +3 -4
- transformers/utils/auto_docstring.py +1 -0
- transformers/utils/generic.py +27 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +61 -16
- transformers/utils/kernel_config.py +4 -2
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +75 -242
- transformers/video_processing_utils.py +1 -2
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
transformers/models/fast_vlm/modular_fast_vlm.py
@@ -0,0 +1,273 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...configuration_utils import PreTrainedConfig
+from ...utils import auto_docstring
+from ..auto import CONFIG_MAPPING
+from ..llava.configuration_llava import LlavaConfig
+from ..llava.modeling_llava import (
+    LlavaForConditionalGeneration,
+    LlavaModel,
+    LlavaMultiModalProjector,
+    LlavaPreTrainedModel,
+)
+
+
+class FastVlmConfig(LlavaConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FastVlmForConditionalGeneration`]. It is used to instantiate a
+    FastVLM model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield the same configuration as the one of FastVLM-7B.
+
+    e.g. [KamilaMila/FastVLM-7B](https://huggingface.co/KamilaMila/FastVLM-7B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `TimmWrapperConfig` for `fastvit_mci3`):
+            The config object or dictionary of the vision backbone.
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`):
+            The config object or dictionary of the text backbone.
+        image_token_id (`int`, *optional*, defaults to 151646):
+            The image token index to encode the image prompt.
+        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The activation function used by the multimodal projector.
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"full"`):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Only "full" supported.
+        vision_feature_layer (`Union[int, list[int]]`, *optional*, defaults to -1):
+            The index of the layer to select the vision feature. If multiple indices are provided,
+            the vision feature of the corresponding indices will be concatenated to form the
+            vision features. Only -1 supported.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
+
+    Example:
+
+    ```python
+    >>> from transformers import FastVlmForConditionalGeneration, FastVlmConfig
+
+    >>> # Initializing a FastVLM-7B style configuration
+    >>> configuration = FastVlmConfig()
+
+    >>> # Initializing a model from the FastVLM-7B style configuration
+    >>> model = FastVlmForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "fast_vlm"
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        image_token_id=151646,
+        projector_hidden_act="gelu",
+        vision_feature_select_strategy="full",
+        vision_feature_layer=-1,
+        multimodal_projector_bias=True,
+        **kwargs,
+    ):
+        self.image_token_id = image_token_id
+        self.projector_hidden_act = projector_hidden_act
+
+        if vision_feature_select_strategy != "full":
+            raise ValueError(
+                f"Unexpected select feature strategy: {vision_feature_select_strategy}. Only 'full' is supported in FastVLM."
+            )
+
+        if vision_feature_layer != -1:
+            raise ValueError(
+                f"Unexpected vision feature layer: {vision_feature_layer}. Only -1 is supported in FastVLM."
+            )
+
+        self.vision_feature_select_strategy = vision_feature_select_strategy
+        self.vision_feature_layer = vision_feature_layer
+
+        if isinstance(vision_config, dict):
+            vision_config["model_type"] = vision_config.get("model_type", "timm_wrapper")
+            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif vision_config is None:
+            vision_config = CONFIG_MAPPING["timm_wrapper"](
+                architecture="fastvit_mci3",
+                do_pooling=True,
+                global_pool="avg",
+                hidden_size=3072,
+                initializer_range=0.02,
+                model_args={"inference_mode": True},
+            )
+
+        self.vision_config = vision_config
+
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "qwen2")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["qwen2"](
+                hidden_size=3584,
+                vocab_size=152128,
+                intermediate_size=18944,
+                num_attention_heads=28,
+                num_key_value_heads=4,
+                num_hidden_layers=28,
+            )
+
+        self.text_config = text_config
+        self.multimodal_projector_bias = multimodal_projector_bias
+
+        PreTrainedConfig.__init__(**kwargs)
+
+
+class FastVlmMultiModalProjector(LlavaMultiModalProjector):
+    def __init__(self, config: FastVlmConfig):
+        nn.Module.__init__()
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size,
+            config.text_config.hidden_size,
+            bias=config.multimodal_projector_bias,
+        )
+        self.act = ACT2FN[config.projector_hidden_act]
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
+
+
+class FastVlmPreTrainedModel(LlavaPreTrainedModel):
+    pass
+
+
+class FastVlmModel(LlavaModel):
+    _checkpoint_conversion_mapping = {}
+
+    def __init__(self, config: FastVlmConfig):
+        super().__init__(config)
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Obtains image last hidden states from the vision tower and apply multimodal projection.
+
+        Args:
+            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
+                The tensors corresponding to the input images.
+            vision_feature_layer (`Union[int, list[int]]`, *optional*):
+                The index/indices of the layer to select the vision feature. Only -1 supported.
+            vision_feature_select_strategy (`str`, *optional*):
+                The feature selection strategy used to select the vision feature from the vision backbone.
+                Only "full" supported.
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
+        """
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+
+        kwargs = {k: v for k, v in kwargs.items() if v is not None}
+        image_outputs = self.vision_tower(pixel_values, **kwargs)
+
+        # since the vision tower is hybrid in FastVLM, its output needs to be handled differently from Llava
+        selected_image_feature = image_outputs.last_hidden_state
+        selected_image_feature = selected_image_feature.flatten(2).permute(0, 2, 1)
+        image_features = self.multi_modal_projector(selected_image_feature)
+        image_features = list(image_features)
+        return image_features
+
+    def forward(self, **super_kwargs):
+        r"""
+        vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*):
+            The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the
+            corresponding indices will be concatenated to form the vision features. Only -1 supported.
+        vision_feature_select_strategy (`str`, *optional*):
+            The feature selection strategy used to select the vision feature from the vision backbone. Only "full" supported.
+        """
+        super().forward(**super_kwargs)
+
+
+@auto_docstring(
+    custom_intro="""
+    The FastVlm model which consists of a vision backbone and a language model.
+    """
+)
+class FastVlmForConditionalGeneration(LlavaForConditionalGeneration):
+    _checkpoint_conversion_mapping = {}
+
+    def forward(self, **super_kwargs):
+        r"""
+        vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*):
+            The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the
+            corresponding indices will be concatenated to form the vision features. Only -1 supported.
+        vision_feature_select_strategy (`str`, *optional*):
+            The feature selection strategy used to select the vision feature from the vision backbone. Only "full" supported.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, AutoModelForImageTextToText
+        >>> import torch
+
+        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        >>> model = AutoModelForImageTextToText.from_pretrained("KamilaMila/FastVLM-0.5B").to(device)
+        >>> processor = AutoProcessor.from_pretrained("KamilaMila/FastVLM-0.5B")
+
+        >>> conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What are these?"},
+                    {"type": "image"}
+                ]
+            }
+        ]
+
+        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
+
+        >>> # Generate
+        >>> generated_ids = model.generate(**inputs, max_new_tokens=15)
+        >>> print(processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
+        system\n You are a helpful assistant.\n user\n What are these?\n assistant\n The image depicts a traditional Chinese street...
+        ```"""
+        super().forward(**super_kwargs)
+
+
+__all__ = ["FastVlmForConditionalGeneration", "FastVlmModel", "FastVlmPreTrainedModel", "FastVlmConfig"]
transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py
@@ -514,7 +514,7 @@ class FastSpeech2ConformerConvolutionModule(nn.Module):
 
         Args:
             hidden_states (`torch.Tensor` of shape `(batch, time, channels)`): Input tensor.
-            attention_mask (`torch.Tensor` of shape `(batch, 1, time)`): Attention mask.
+            attention_mask (`torch.Tensor` of shape `(batch, 1, time, time)`): Attention mask.
 
         Returns:
             `torch.Tensor`: Output tensor of shape `(batch, time, channels)`.
@@ -530,7 +530,10 @@ class FastSpeech2ConformerConvolutionModule(nn.Module):
 
         # Apply padding mask before convolution
         if attention_mask is not None:
-            all_masked_rows = torch.all(~attention_mask, dim=2)
+            if attention_mask.dtype == torch.bool:
+                all_masked_rows = torch.all(~attention_mask, dim=2)
+            else:
+                all_masked_rows = torch.all(~(attention_mask == 0.0), dim=2)
             hidden_states = hidden_states.masked_fill(all_masked_rows, 0.0)
 
         # 1D Depthwise Conv
@@ -1118,6 +1121,7 @@ class FastSpeech2ConformerModel(FastSpeech2ConformerPreTrainedModel):
         return_dict: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FastSpeech2ConformerModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1433,7 +1437,7 @@ class FastSpeech2ConformerHifiGan(PreTrainedModel):
         waveform.
         """
     )
-    def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, spectrogram: torch.FloatTensor, **kwargs) -> torch.FloatTensor:
         r"""
         spectrogram (`torch.FloatTensor`):
             Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
@@ -1509,6 +1513,7 @@ class FastSpeech2ConformerWithHifiGan(PreTrainedModel):
         return_dict: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FastSpeech2ConformerModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
transformers/models/flaubert/modeling_flaubert.py
@@ -792,6 +792,7 @@ class FlaubertModel(FlaubertPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1002,6 +1003,7 @@ class FlaubertWithLMHeadModel(FlaubertPreTrainedModel, GenerationMixin):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MaskedLMOutput]:
         r"""
         langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1090,6 +1092,7 @@ class FlaubertForSequenceClassification(FlaubertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1195,6 +1198,7 @@ class FlaubertForTokenClassification(FlaubertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1286,6 +1290,7 @@ class FlaubertForQuestionAnsweringSimple(FlaubertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         r"""
         langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1423,6 +1428,7 @@ class FlaubertForQuestionAnswering(FlaubertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FlaubertForQuestionAnsweringOutput]:
         r"""
         langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1538,6 +1544,7 @@ class FlaubertForMultipleChoice(FlaubertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MultipleChoiceModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
transformers/models/flava/modeling_flava.py
@@ -725,6 +725,7 @@ class FlavaImageModel(FlavaPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
@@ -804,6 +805,7 @@ class FlavaTextModel(FlavaPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`):
@@ -896,6 +898,7 @@ class FlavaMultimodalModel(FlavaPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         hidden_states (`torch.FloatTensor` of shape `(batch_size, image_num_patches + text_seq_len, hidden_size)`):
@@ -1103,6 +1106,7 @@ class FlavaModel(FlavaPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: bool = True,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FlavaOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`):
@@ -1380,7 +1384,7 @@ class FlavaImageCodebook(FlavaPreTrainedModel):
         z_logits = self.blocks(pixel_values)
         return nn.Softmax(dim=1)(z_logits)
 
-    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+    def forward(self, pixel_values: torch.FloatTensor, **kwargs) -> torch.Tensor:
         f"""
         Args:
             pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
@@ -1575,6 +1579,7 @@ class FlavaForPreTraining(FlavaPreTrainedModel):
         output_hidden_states: bool = True,
         return_dict: Optional[bool] = None,
         return_loss: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], FlavaForPreTrainingOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_len)`):
transformers/models/flex_olmo/modeling_flex_olmo.py
@@ -30,7 +30,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
@@ -38,7 +38,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import OutputRecorder, check_model_inputs
+from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from .configuration_flex_olmo import FlexOlmoConfig
 
 
@@ -119,7 +119,7 @@ class FlexOlmoRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -216,6 +216,7 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class FlexOlmoAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -241,7 +242,6 @@ class FlexOlmoAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         self.q_norm = FlexOlmoRMSNorm(config.num_attention_heads * self.head_dim, config.rms_norm_eps)
         self.k_norm = FlexOlmoRMSNorm(config.num_key_value_heads * self.head_dim, config.rms_norm_eps)
 
@@ -252,7 +252,6 @@ class FlexOlmoAttention(nn.Module):
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
transformers/models/florence2/modeling_florence2.py
@@ -541,7 +541,7 @@ class Florence2VisionBackbone(Florence2VisionPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
-    def forward(self, hidden_states: torch.Tensor):
+    def forward(self, hidden_states: torch.Tensor, **kwargs):
         for conv, block in zip(self.convs, self.blocks):
             hidden_states = conv(hidden_states)
             for layer in block:
@@ -708,6 +708,7 @@ class Florence2Model(Florence2PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, Florence2Seq2SeqModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1422,7 +1422,7 @@ class Florence2VisionBackbone(Florence2VisionPreTrainedModel):
|
|
|
1422
1422
|
# Initialize weights and apply final processing
|
|
1423
1423
|
self.post_init()
|
|
1424
1424
|
|
|
1425
|
-
def forward(self, hidden_states: torch.Tensor):
|
|
1425
|
+
def forward(self, hidden_states: torch.Tensor, **kwargs):
|
|
1426
1426
|
for conv, block in zip(self.convs, self.blocks):
|
|
1427
1427
|
hidden_states = conv(hidden_states)
|
|
1428
1428
|
for layer in block:
|
|
@@ -1551,6 +1551,7 @@ class Florence2Model(LlavaModel):
|
|
|
1551
1551
|
output_hidden_states: Optional[bool] = None,
|
|
1552
1552
|
return_dict: Optional[bool] = None,
|
|
1553
1553
|
cache_position: Optional[torch.LongTensor] = None,
|
|
1554
|
+
**kwargs,
|
|
1554
1555
|
) -> Union[tuple, Florence2Seq2SeqModelOutput]:
|
|
1555
1556
|
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
|
1556
1557
|
output_hidden_states = (
|
|
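Context note (not part of the diff itself): the Florence2 hunks above, and the FNet, FocalNet, FSMT, and Funnel hunks below, all make the same mechanical change — adding `**kwargs` to `forward` signatures. The snippet below is a generic, hedged illustration of why that matters: call paths that fan out a shared set of keyword arguments stop raising `TypeError` on models whose signatures do not declare every argument. The class and argument names here are invented for the example.

```python
# Illustration only; the classes here are toy stand-ins, not transformers models.
import torch
from torch import nn


class WithoutKwargs(nn.Module):
    def forward(self, input_ids, return_dict=None):
        return input_ids


class WithKwargs(nn.Module):
    def forward(self, input_ids, return_dict=None, **kwargs):
        return input_ids  # unknown keyword arguments are simply absorbed


extra = {"cache_position": torch.arange(4)}  # an argument some call paths now pass along

WithKwargs()(torch.ones(1, 4), **extra)  # fine: absorbed by **kwargs
try:
    WithoutKwargs()(torch.ones(1, 4), **extra)
except TypeError as err:
    print("old signature rejects it:", err)
```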
@@ -439,6 +439,7 @@ class FNetModel(FNetPreTrainedModel):
         inputs_embeds: Optional[torch.FloatTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -540,6 +541,7 @@ class FNetForPreTraining(FNetPreTrainedModel):
         next_sentence_label: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FNetForPreTrainingOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -632,6 +634,7 @@ class FNetForMaskedLM(FNetPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MaskedLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -783,6 +786,7 @@ class FNetForSequenceClassification(FNetPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -856,6 +860,7 @@ class FNetForMultipleChoice(FNetPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MultipleChoiceModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -950,6 +955,7 @@ class FNetForTokenClassification(FNetPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1008,6 +1014,7 @@ class FNetForQuestionAnswering(FNetPreTrainedModel):
         end_positions: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -628,6 +628,7 @@ class FocalNetModel(FocalNetPreTrainedModel):
         bool_masked_pos: Optional[torch.BoolTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FocalNetModelOutput]:
         r"""
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
@@ -710,6 +711,7 @@ class FocalNetForMaskedImageModeling(FocalNetPreTrainedModel):
         bool_masked_pos: Optional[torch.BoolTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FocalNetMaskedImageModelingOutput]:
         r"""
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
@@ -812,6 +814,7 @@ class FocalNetForImageClassification(FocalNetPreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FocalNetImageClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -871,6 +874,7 @@ class FocalNetBackbone(FocalNetPreTrainedModel, BackboneMixin):
         pixel_values: torch.Tensor,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> BackboneOutput:
         r"""
         Examples:
@@ -843,6 +843,7 @@ class FSMTModel(PretrainedFSMTModel):
         decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -980,6 +981,7 @@ class FSMTForConditionalGeneration(PretrainedFSMTModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -766,6 +766,7 @@ class FunnelBaseModel(FunnelPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -832,6 +833,7 @@ class FunnelModel(FunnelPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -923,6 +925,7 @@ class FunnelForPreTraining(FunnelPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FunnelForPreTrainingOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1012,6 +1015,7 @@ class FunnelForMaskedLM(FunnelPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MaskedLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1079,6 +1083,7 @@ class FunnelForSequenceClassification(FunnelPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1158,6 +1163,7 @@ class FunnelForMultipleChoice(FunnelPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MultipleChoiceModelOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1233,6 +1239,7 @@ class FunnelForTokenClassification(FunnelPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1295,6 +1302,7 @@ class FunnelForQuestionAnswering(FunnelPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 