transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
transformers/__init__.py
CHANGED
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
|
|
19
19
|
# in the namespace without actually importing anything (and especially none of the backends).
|
|
20
20
|
|
|
21
|
-
__version__ = "5.0.0rc3"
|
|
21
|
+
__version__ = "5.1.0"
|
|
22
22
|
|
|
23
23
|
import importlib
|
|
24
24
|
import sys
|
|
@@ -147,7 +147,6 @@ _import_structure = {
|
|
|
147
147
|
"ImageSegmentationPipeline",
|
|
148
148
|
"ImageTextToTextPipeline",
|
|
149
149
|
"ImageToImagePipeline",
|
|
150
|
-
"ImageToTextPipeline",
|
|
151
150
|
"JsonPipelineDataFormat",
|
|
152
151
|
"KeypointMatchingPipeline",
|
|
153
152
|
"MaskGenerationPipeline",
|
|
@@ -157,14 +156,11 @@ _import_structure = {
|
|
|
157
156
|
"Pipeline",
|
|
158
157
|
"PipelineDataFormat",
|
|
159
158
|
"QuestionAnsweringPipeline",
|
|
160
|
-
"SummarizationPipeline",
|
|
161
159
|
"TableQuestionAnsweringPipeline",
|
|
162
|
-
"Text2TextGenerationPipeline",
|
|
163
160
|
"TextClassificationPipeline",
|
|
164
161
|
"TextGenerationPipeline",
|
|
165
162
|
"TextToAudioPipeline",
|
|
166
163
|
"TokenClassificationPipeline",
|
|
167
|
-
"TranslationPipeline",
|
|
168
164
|
"VideoClassificationPipeline",
|
|
169
165
|
"VisualQuestionAnsweringPipeline",
|
|
170
166
|
"ZeroShotAudioClassificationPipeline",
|
|
@@ -443,6 +439,7 @@ else:
|
|
|
443
439
|
_import_structure["modeling_flash_attention_utils"] = []
|
|
444
440
|
_import_structure["modeling_layers"] = ["GradientCheckpointingLayer"]
|
|
445
441
|
_import_structure["modeling_outputs"] = []
|
|
442
|
+
_import_structure["backbone_utils"] = ["BackboneConfigMixin", "BackboneMixin"]
|
|
446
443
|
_import_structure["modeling_rope_utils"] = ["ROPE_INIT_FUNCTIONS", "dynamic_rope_update", "RopeParameters"]
|
|
447
444
|
_import_structure["modeling_utils"] = ["PreTrainedModel", "AttentionInterface"]
|
|
448
445
|
_import_structure["masking_utils"] = ["AttentionMaskInterface"]
|
|
@@ -471,6 +468,8 @@ else:
|
|
|
471
468
|
# Direct imports for type-checking
|
|
472
469
|
if TYPE_CHECKING:
|
|
473
470
|
# All modeling imports
|
|
471
|
+
# Models
|
|
472
|
+
from .backbone_utils import BackboneConfigMixin, BackboneMixin
|
|
474
473
|
from .cache_utils import Cache as Cache
|
|
475
474
|
from .cache_utils import DynamicCache as DynamicCache
|
|
476
475
|
from .cache_utils import DynamicLayer as DynamicLayer
|
|
@@ -613,8 +612,6 @@ if TYPE_CHECKING:
|
|
|
613
612
|
from .integrations.executorch import convert_and_export_with_cache as convert_and_export_with_cache
|
|
614
613
|
from .masking_utils import AttentionMaskInterface as AttentionMaskInterface
|
|
615
614
|
from .model_debugging_utils import model_addition_debugger_context as model_addition_debugger_context
|
|
616
|
-
|
|
617
|
-
# Models
|
|
618
615
|
from .modeling_layers import GradientCheckpointingLayer as GradientCheckpointingLayer
|
|
619
616
|
from .modeling_rope_utils import ROPE_INIT_FUNCTIONS as ROPE_INIT_FUNCTIONS
|
|
620
617
|
from .modeling_rope_utils import RopeParameters as RopeParameters
|
|
@@ -659,7 +656,6 @@ if TYPE_CHECKING:
|
|
|
659
656
|
from .pipelines import ImageSegmentationPipeline as ImageSegmentationPipeline
|
|
660
657
|
from .pipelines import ImageTextToTextPipeline as ImageTextToTextPipeline
|
|
661
658
|
from .pipelines import ImageToImagePipeline as ImageToImagePipeline
|
|
662
|
-
from .pipelines import ImageToTextPipeline as ImageToTextPipeline
|
|
663
659
|
from .pipelines import JsonPipelineDataFormat as JsonPipelineDataFormat
|
|
664
660
|
from .pipelines import KeypointMatchingPipeline as KeypointMatchingPipeline
|
|
665
661
|
from .pipelines import MaskGenerationPipeline as MaskGenerationPipeline
|
|
@@ -669,14 +665,11 @@ if TYPE_CHECKING:
|
|
|
669
665
|
from .pipelines import Pipeline as Pipeline
|
|
670
666
|
from .pipelines import PipelineDataFormat as PipelineDataFormat
|
|
671
667
|
from .pipelines import QuestionAnsweringPipeline as QuestionAnsweringPipeline
|
|
672
|
-
from .pipelines import SummarizationPipeline as SummarizationPipeline
|
|
673
668
|
from .pipelines import TableQuestionAnsweringPipeline as TableQuestionAnsweringPipeline
|
|
674
|
-
from .pipelines import Text2TextGenerationPipeline as Text2TextGenerationPipeline
|
|
675
669
|
from .pipelines import TextClassificationPipeline as TextClassificationPipeline
|
|
676
670
|
from .pipelines import TextGenerationPipeline as TextGenerationPipeline
|
|
677
671
|
from .pipelines import TextToAudioPipeline as TextToAudioPipeline
|
|
678
672
|
from .pipelines import TokenClassificationPipeline as TokenClassificationPipeline
|
|
679
|
-
from .pipelines import TranslationPipeline as TranslationPipeline
|
|
680
673
|
from .pipelines import VideoClassificationPipeline as VideoClassificationPipeline
|
|
681
674
|
from .pipelines import VisualQuestionAnsweringPipeline as VisualQuestionAnsweringPipeline
|
|
682
675
|
from .pipelines import ZeroShotAudioClassificationPipeline as ZeroShotAudioClassificationPipeline
|
transformers/activations.py
CHANGED
|
@@ -247,8 +247,8 @@ class XIELUActivation(nn.Module):
|
|
|
247
247
|
self.register_buffer("eps", torch.tensor(eps, dtype=dtype))
|
|
248
248
|
self.with_vector_loads = with_vector_loads
|
|
249
249
|
# Temporary until xIELU CUDA fully implemented
|
|
250
|
-
self._beta_scalar = float(
|
|
251
|
-
self._eps_scalar = float(
|
|
250
|
+
self._beta_scalar = float(beta)
|
|
251
|
+
self._eps_scalar = float(eps)
|
|
252
252
|
|
|
253
253
|
self._xielu_cuda_obj = None
|
|
254
254
|
try:
|
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
# Copyright 2026 The HuggingFace Inc. team.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""Collection of utils to be used by backbones and their components."""
|
|
16
|
+
|
|
17
|
+
import enum
|
|
18
|
+
import inspect
|
|
19
|
+
|
|
20
|
+
from huggingface_hub import repo_exists
|
|
21
|
+
|
|
22
|
+
from .utils import logging
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
logger = logging.get_logger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class BackboneType(enum.Enum):
|
|
29
|
+
TIMM = "timm"
|
|
30
|
+
TRANSFORMERS = "transformers"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class BackboneConfigMixin:
|
|
34
|
+
"""
|
|
35
|
+
A Mixin to support handling the `out_features` and `out_indices` attributes for the backbone configurations.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
def set_output_features_output_indices(
|
|
39
|
+
self,
|
|
40
|
+
out_features: list | None,
|
|
41
|
+
out_indices: list | None,
|
|
42
|
+
):
|
|
43
|
+
"""
|
|
44
|
+
Sets output indices and features to new values and aligns them with the given `stage_names`.
|
|
45
|
+
If one of the inputs is not given, find the corresponding `out_features` or `out_indices`
|
|
46
|
+
for the given `stage_names`.
|
|
47
|
+
|
|
48
|
+
Args:
|
|
49
|
+
out_features (`list[str]`, *optional*):
|
|
50
|
+
The names of the features for the backbone to output. Defaults to `config._out_features` if not provided.
|
|
51
|
+
out_indices (`list[int]` or `tuple[int]`, *optional*):
|
|
52
|
+
The indices of the features for the backbone to output. Defaults to `config._out_indices` if not provided.
|
|
53
|
+
"""
|
|
54
|
+
self._out_features = out_features
|
|
55
|
+
self._out_indices = list(out_indices) if isinstance(out_indices, tuple) else out_indices
|
|
56
|
+
|
|
57
|
+
# First verify that the out_features and out_indices are valid
|
|
58
|
+
self.verify_out_features_out_indices()
|
|
59
|
+
|
|
60
|
+
# Align output features with indices
|
|
61
|
+
out_features, out_indices = self._out_features, self._out_indices
|
|
62
|
+
if out_indices is None and out_features is None:
|
|
63
|
+
out_indices = [len(self.stage_names) - 1]
|
|
64
|
+
out_features = [self.stage_names[-1]]
|
|
65
|
+
elif out_indices is None and out_features is not None:
|
|
66
|
+
out_indices = [self.stage_names.index(layer) for layer in out_features]
|
|
67
|
+
elif out_features is None and out_indices is not None:
|
|
68
|
+
out_features = [self.stage_names[idx] for idx in out_indices]
|
|
69
|
+
|
|
70
|
+
# Update values and verify that the aligned out_features and out_indices are valid
|
|
71
|
+
self._out_features, self._out_indices = out_features, out_indices
|
|
72
|
+
self.verify_out_features_out_indices()
|
|
73
|
+
|
|
74
|
+
def verify_out_features_out_indices(self):
|
|
75
|
+
"""
|
|
76
|
+
Verify that out_indices and out_features are valid for the given stage_names.
|
|
77
|
+
"""
|
|
78
|
+
if self.stage_names is None:
|
|
79
|
+
raise ValueError("Stage_names must be set for transformers backbones")
|
|
80
|
+
|
|
81
|
+
if self._out_features is not None:
|
|
82
|
+
if not isinstance(self._out_features, (list,)):
|
|
83
|
+
raise ValueError(f"out_features must be a list got {type(self._out_features)}")
|
|
84
|
+
if any(feat not in self.stage_names for feat in self._out_features):
|
|
85
|
+
raise ValueError(
|
|
86
|
+
f"out_features must be a subset of stage_names: {self.stage_names} got {self._out_features}"
|
|
87
|
+
)
|
|
88
|
+
if len(self._out_features) != len(set(self._out_features)):
|
|
89
|
+
raise ValueError(f"out_features must not contain any duplicates, got {self._out_features}")
|
|
90
|
+
if self._out_features != (
|
|
91
|
+
sorted_feats := [feat for feat in self.stage_names if feat in self._out_features]
|
|
92
|
+
):
|
|
93
|
+
raise ValueError(
|
|
94
|
+
f"out_features must be in the same order as stage_names, expected {sorted_feats} got {self._out_features}"
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
if self._out_indices is not None:
|
|
98
|
+
if not isinstance(self._out_indices, list):
|
|
99
|
+
raise ValueError(f"out_indices must be a list, got {type(self._out_indices)}")
|
|
100
|
+
# Convert negative indices to their positive equivalent: [-1,] -> [len(stage_names) - 1,]
|
|
101
|
+
positive_indices = tuple(idx % len(self.stage_names) if idx < 0 else idx for idx in self._out_indices)
|
|
102
|
+
if any(idx for idx in positive_indices if idx not in range(len(self.stage_names))):
|
|
103
|
+
raise ValueError(
|
|
104
|
+
f"out_indices must be valid indices for stage_names {self.stage_names}, got {self._out_indices}"
|
|
105
|
+
)
|
|
106
|
+
if len(positive_indices) != len(set(positive_indices)):
|
|
107
|
+
msg = f"out_indices must not contain any duplicates, got {self._out_indices}"
|
|
108
|
+
msg += f"(equivalent to {positive_indices}))" if positive_indices != self._out_indices else ""
|
|
109
|
+
raise ValueError(msg)
|
|
110
|
+
if positive_indices != tuple(sorted(positive_indices)):
|
|
111
|
+
sorted_negative = [
|
|
112
|
+
idx for _, idx in sorted(zip(positive_indices, self._out_indices), key=lambda x: x[0])
|
|
113
|
+
]
|
|
114
|
+
raise ValueError(
|
|
115
|
+
f"out_indices must be in the same order as stage_names, expected {sorted_negative} got {self._out_indices}"
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
if self._out_features is not None and self._out_indices is not None:
|
|
119
|
+
if len(self._out_features) != len(self._out_indices):
|
|
120
|
+
raise ValueError("out_features and out_indices should have the same length if both are set")
|
|
121
|
+
if self._out_features != [self.stage_names[idx] for idx in self._out_indices]:
|
|
122
|
+
raise ValueError("out_features and out_indices should correspond to the same stages if both are set")
|
|
123
|
+
|
|
124
|
+
@property
|
|
125
|
+
def out_features(self):
|
|
126
|
+
return self._out_features
|
|
127
|
+
|
|
128
|
+
@out_features.setter
|
|
129
|
+
def out_features(self, out_features: list[str]):
|
|
130
|
+
"""
|
|
131
|
+
Set the out_features attribute. This will also update the out_indices attribute to match the new out_features.
|
|
132
|
+
"""
|
|
133
|
+
self.set_output_features_output_indices(out_features=out_features, out_indices=None)
|
|
134
|
+
|
|
135
|
+
@property
|
|
136
|
+
def out_indices(self):
|
|
137
|
+
return self._out_indices
|
|
138
|
+
|
|
139
|
+
@out_indices.setter
|
|
140
|
+
def out_indices(self, out_indices: tuple[int, ...] | list[int]):
|
|
141
|
+
"""
|
|
142
|
+
Set the out_indices attribute. This will also update the out_features attribute to match the new out_indices.
|
|
143
|
+
"""
|
|
144
|
+
out_indices = list(out_indices) if out_indices is not None else out_indices
|
|
145
|
+
self.set_output_features_output_indices(out_features=None, out_indices=out_indices)
|
|
146
|
+
|
|
147
|
+
def to_dict(self):
|
|
148
|
+
"""
|
|
149
|
+
Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PreTrainedConfig` to
|
|
150
|
+
include the `out_features` and `out_indices` attributes.
|
|
151
|
+
"""
|
|
152
|
+
output = super().to_dict()
|
|
153
|
+
output["out_features"] = output.pop("_out_features", None)
|
|
154
|
+
output["out_indices"] = output.pop("_out_indices", None)
|
|
155
|
+
return output
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class BackboneMixin:
|
|
159
|
+
backbone_type: BackboneType | None = None
|
|
160
|
+
|
|
161
|
+
# Attribute to indicate if the backbone has attention and can return attention outputs.
|
|
162
|
+
# Should be set to `False` for conv-based models to be able to run `forward_with_filtered_kwargs`
|
|
163
|
+
has_attentions: bool = True
|
|
164
|
+
|
|
165
|
+
def __init__(self, *args, **kwargs) -> None:
|
|
166
|
+
"""
|
|
167
|
+
Method to initialize the backbone. This method is called by the constructor of the base class after the
|
|
168
|
+
pretrained model weights have been loaded.
|
|
169
|
+
"""
|
|
170
|
+
super().__init__(*args, **kwargs)
|
|
171
|
+
timm_backbone = kwargs.pop("timm_backbone", None)
|
|
172
|
+
if timm_backbone is not None:
|
|
173
|
+
self.backbone_type = BackboneType.TIMM
|
|
174
|
+
else:
|
|
175
|
+
self.backbone_type = BackboneType.TRANSFORMERS
|
|
176
|
+
|
|
177
|
+
if self.backbone_type == BackboneType.TIMM:
|
|
178
|
+
self._init_timm_backbone(backbone=timm_backbone)
|
|
179
|
+
elif self.backbone_type == BackboneType.TRANSFORMERS:
|
|
180
|
+
self._init_transformers_backbone()
|
|
181
|
+
else:
|
|
182
|
+
raise ValueError(f"backbone_type {self.backbone_type} not supported.")
|
|
183
|
+
|
|
184
|
+
def _init_timm_backbone(self, backbone) -> None:
|
|
185
|
+
"""
|
|
186
|
+
Initialize the backbone model from timm. The backbone must already be loaded to backbone
|
|
187
|
+
"""
|
|
188
|
+
|
|
189
|
+
# These will disagree with the defaults for the transformers models e.g. for resnet50
|
|
190
|
+
# the transformer model has out_features = ['stem', 'stage1', 'stage2', 'stage3', 'stage4']
|
|
191
|
+
# the timm model has out_features = ['act', 'layer1', 'layer2', 'layer3', 'layer4']
|
|
192
|
+
self.stage_names = [stage["module"] for stage in backbone.feature_info.info]
|
|
193
|
+
self.num_features = [stage["num_chs"] for stage in backbone.feature_info.info]
|
|
194
|
+
|
|
195
|
+
self.config._out_indices = list(backbone.feature_info.out_indices)
|
|
196
|
+
self.config._out_features = backbone.feature_info.module_name()
|
|
197
|
+
self.config.stage_names = self.stage_names
|
|
198
|
+
|
|
199
|
+
# We verify the out indices and out features are valid
|
|
200
|
+
self.config.verify_out_features_out_indices()
|
|
201
|
+
|
|
202
|
+
def _init_transformers_backbone(self) -> None:
|
|
203
|
+
self.stage_names = self.config.stage_names
|
|
204
|
+
self.config.verify_out_features_out_indices()
|
|
205
|
+
# Number of channels for each stage. This is set in the transformer backbone model init
|
|
206
|
+
self.num_features = None
|
|
207
|
+
|
|
208
|
+
@property
|
|
209
|
+
def out_features(self):
|
|
210
|
+
return self.config._out_features
|
|
211
|
+
|
|
212
|
+
@out_features.setter
|
|
213
|
+
def out_features(self, out_features: list[str]):
|
|
214
|
+
"""
|
|
215
|
+
Set the out_features attribute. This will also update the out_indices attribute to match the new out_features.
|
|
216
|
+
"""
|
|
217
|
+
self.config.out_features = out_features
|
|
218
|
+
|
|
219
|
+
@property
|
|
220
|
+
def out_indices(self):
|
|
221
|
+
return self.config._out_indices
|
|
222
|
+
|
|
223
|
+
@out_indices.setter
|
|
224
|
+
def out_indices(self, out_indices: tuple[int] | list[int]):
|
|
225
|
+
"""
|
|
226
|
+
Set the out_indices attribute. This will also update the out_features attribute to match the new out_indices.
|
|
227
|
+
"""
|
|
228
|
+
self.config.out_indices = out_indices
|
|
229
|
+
|
|
230
|
+
@property
|
|
231
|
+
def out_feature_channels(self):
|
|
232
|
+
# the current backbones will output the number of channels for each stage
|
|
233
|
+
# even if that stage is not in the out_features list.
|
|
234
|
+
return {stage: self.num_features[i] for i, stage in enumerate(self.stage_names)}
|
|
235
|
+
|
|
236
|
+
@property
|
|
237
|
+
def channels(self):
|
|
238
|
+
return [self.out_feature_channels[name] for name in self.out_features]
|
|
239
|
+
|
|
240
|
+
def forward_with_filtered_kwargs(self, *args, **kwargs):
|
|
241
|
+
if not self.has_attentions:
|
|
242
|
+
kwargs.pop("output_attentions", None)
|
|
243
|
+
if self.backbone_type == BackboneType.TIMM:
|
|
244
|
+
signature = dict(inspect.signature(self.forward).parameters)
|
|
245
|
+
kwargs = {k: v for k, v in kwargs.items() if k in signature}
|
|
246
|
+
return self(*args, **kwargs)
|
|
247
|
+
|
|
248
|
+
def forward(
|
|
249
|
+
self,
|
|
250
|
+
pixel_values,
|
|
251
|
+
output_hidden_states: bool | None = None,
|
|
252
|
+
output_attentions: bool | None = None,
|
|
253
|
+
return_dict: bool | None = None,
|
|
254
|
+
):
|
|
255
|
+
raise NotImplementedError("This method should be implemented by the derived class.")
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def consolidate_backbone_kwargs_to_config(
|
|
259
|
+
backbone_config,
|
|
260
|
+
default_backbone: str | None = None,
|
|
261
|
+
default_config_type: str | None = None,
|
|
262
|
+
default_config_kwargs: dict | None = None,
|
|
263
|
+
timm_default_kwargs: dict | None = None,
|
|
264
|
+
**kwargs,
|
|
265
|
+
):
|
|
266
|
+
# Lazy import to avoid circular import issues. Can be imported properly
|
|
267
|
+
# after deleting ref to `BackboneMixin` in `utils/backbone_utils.py`
|
|
268
|
+
from .configuration_utils import PreTrainedConfig
|
|
269
|
+
from .models.auto import CONFIG_MAPPING
|
|
270
|
+
|
|
271
|
+
use_timm_backbone = kwargs.pop("use_timm_backbone", True)
|
|
272
|
+
backbone_kwargs = kwargs.pop("backbone_kwargs", {})
|
|
273
|
+
backbone = kwargs.pop("backbone") if kwargs.get("backbone") is not None else default_backbone
|
|
274
|
+
kwargs.pop("use_pretrained_backbone", None)
|
|
275
|
+
|
|
276
|
+
# Init timm backbone with hardcoded values for BC. If everything is set to `None` and there is
|
|
277
|
+
# a default timm config, we use it to init the backbone.
|
|
278
|
+
if (
|
|
279
|
+
timm_default_kwargs is not None
|
|
280
|
+
and use_timm_backbone
|
|
281
|
+
and backbone is not None
|
|
282
|
+
and backbone_config is None
|
|
283
|
+
and not backbone_kwargs
|
|
284
|
+
):
|
|
285
|
+
backbone_config = CONFIG_MAPPING["timm_backbone"](backbone=backbone, **timm_default_kwargs)
|
|
286
|
+
elif backbone is not None and backbone_config is None:
|
|
287
|
+
if repo_exists(backbone):
|
|
288
|
+
config_dict, _ = PreTrainedConfig.get_config_dict(backbone)
|
|
289
|
+
config_class = CONFIG_MAPPING[config_dict["model_type"]]
|
|
290
|
+
config_dict.update(backbone_kwargs)
|
|
291
|
+
backbone_config = config_class(**config_dict)
|
|
292
|
+
else:
|
|
293
|
+
backbone_config = CONFIG_MAPPING["timm_backbone"](backbone=backbone, **backbone_kwargs)
|
|
294
|
+
elif backbone_config is None and default_config_type is not None:
|
|
295
|
+
logger.info(
|
|
296
|
+
f"`backbone_config` is `None`. Initializing the config with the default `{default_config_type}` vision config."
|
|
297
|
+
)
|
|
298
|
+
default_config_kwargs = default_config_kwargs or {}
|
|
299
|
+
backbone_config = CONFIG_MAPPING[default_config_type](**default_config_kwargs)
|
|
300
|
+
elif isinstance(backbone_config, dict):
|
|
301
|
+
backbone_model_type = backbone_config.get("model_type")
|
|
302
|
+
config_class = CONFIG_MAPPING[backbone_model_type]
|
|
303
|
+
backbone_config = config_class.from_dict(backbone_config)
|
|
304
|
+
|
|
305
|
+
return backbone_config, kwargs
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def load_backbone(config):
|
|
309
|
+
"""
|
|
310
|
+
Loads the backbone model from a config object.
|
|
311
|
+
|
|
312
|
+
If the config is from the backbone model itself, then we return a backbone model with randomly initialized
|
|
313
|
+
weights.
|
|
314
|
+
|
|
315
|
+
If the config is from the parent model of the backbone model itself, then we load the pretrained backbone weights
|
|
316
|
+
if specified.
|
|
317
|
+
"""
|
|
318
|
+
from transformers import AutoBackbone
|
|
319
|
+
|
|
320
|
+
backbone_config = getattr(config, "backbone_config", None)
|
|
321
|
+
|
|
322
|
+
if backbone_config is None:
|
|
323
|
+
backbone = AutoBackbone.from_config(config=config)
|
|
324
|
+
else:
|
|
325
|
+
backbone = AutoBackbone.from_config(config=backbone_config)
|
|
326
|
+
return backbone
|
transformers/cache_utils.py
CHANGED
|
@@ -7,6 +7,7 @@ import torch
|
|
|
7
7
|
from .configuration_utils import PreTrainedConfig
|
|
8
8
|
from .utils import (
|
|
9
9
|
is_hqq_available,
|
|
10
|
+
is_optimum_quanto_available,
|
|
10
11
|
is_quanto_greater,
|
|
11
12
|
is_torch_greater_or_equal,
|
|
12
13
|
is_torchdynamo_compiling,
|
|
@@ -584,7 +585,12 @@ class QuantoQuantizedLayer(QuantizedLayer):
|
|
|
584
585
|
)
|
|
585
586
|
|
|
586
587
|
# We need to import quanto here to avoid circular imports due to optimum/quanto/models/transformers_models.py
|
|
587
|
-
if
|
|
588
|
+
if not is_optimum_quanto_available():
|
|
589
|
+
raise ImportError(
|
|
590
|
+
"You need to install optimum-quanto in order to use KV cache quantization with optimum-quanto "
|
|
591
|
+
"backend. Please install it via with `pip install optimum-quanto`"
|
|
592
|
+
)
|
|
593
|
+
elif is_quanto_greater("0.2.5", accept_dev=True):
|
|
588
594
|
from optimum.quanto import MaxOptimizer, qint2, qint4
|
|
589
595
|
else:
|
|
590
596
|
raise ImportError(
|
|
@@ -634,7 +640,10 @@ class HQQQuantizedLayer(QuantizedLayer):
|
|
|
634
640
|
)
|
|
635
641
|
|
|
636
642
|
if not is_hqq_available():
|
|
637
|
-
raise ImportError(
|
|
643
|
+
raise ImportError(
|
|
644
|
+
"You need to install `HQQ` in order to use KV cache quantization with HQQ backend. "
|
|
645
|
+
"Please install it via with `pip install hqq`"
|
|
646
|
+
)
|
|
638
647
|
|
|
639
648
|
if self.nbits not in [1, 2, 3, 4, 8]:
|
|
640
649
|
raise ValueError(
|
transformers/cli/serve.py
CHANGED
|
@@ -839,6 +839,17 @@ class Serve:
|
|
|
839
839
|
for result in self.running_continuous_batching_manager.request_id_iter(request_id):
|
|
840
840
|
n_tokens_generated += 1
|
|
841
841
|
|
|
842
|
+
# Always yield the token content (even for the final FINISHED token)
|
|
843
|
+
if result.generated_tokens:
|
|
844
|
+
token_id = result.generated_tokens[-1]
|
|
845
|
+
yield self.build_chat_completion_chunk(
|
|
846
|
+
request_id=request_id,
|
|
847
|
+
content=token_id,
|
|
848
|
+
model=model_id_and_revision,
|
|
849
|
+
decode_stream=decode_stream,
|
|
850
|
+
tokenizer=tokenizer,
|
|
851
|
+
)
|
|
852
|
+
|
|
842
853
|
if result.status == RequestStatus.FINISHED:
|
|
843
854
|
generated_all_tokens = n_tokens_generated >= generation_config.max_new_tokens
|
|
844
855
|
|
|
@@ -855,14 +866,6 @@ class Serve:
|
|
|
855
866
|
model=model_id_and_revision,
|
|
856
867
|
)
|
|
857
868
|
break
|
|
858
|
-
else:
|
|
859
|
-
yield self.build_chat_completion_chunk(
|
|
860
|
-
request_id=request_id,
|
|
861
|
-
content=result.generated_tokens[-1],
|
|
862
|
-
model=model_id_and_revision,
|
|
863
|
-
decode_stream=decode_stream,
|
|
864
|
-
tokenizer=tokenizer,
|
|
865
|
-
)
|
|
866
869
|
|
|
867
870
|
except Exception as e:
|
|
868
871
|
logger.error(str(e))
|
|
@@ -114,16 +114,6 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
|
|
|
114
114
|
Whether or not the model should return a [`~transformers.utils.ModelOutput`] instead of a plain tuple.
|
|
115
115
|
is_encoder_decoder (`bool`, *optional*, defaults to `False`):
|
|
116
116
|
Whether the model is used as an encoder/decoder or not.
|
|
117
|
-
is_decoder (`bool`, *optional*, defaults to `False`):
|
|
118
|
-
Whether to only use the decoder in an encoder-decoder architecture, otherwise it has no effect on
|
|
119
|
-
decoder-only or encoder-only architectures.
|
|
120
|
-
cross_attention_hidden_size (`bool`, *optional*):
|
|
121
|
-
The hidden size of the cross-attention layer in case the model is used as a decoder in an encoder-decoder
|
|
122
|
-
setting and the cross-attention hidden dimension differs from `self.config.hidden_size`.
|
|
123
|
-
add_cross_attention (`bool`, *optional*, defaults to `False`):
|
|
124
|
-
Whether cross-attention layers should be added to the model. Note, this option is only relevant for models
|
|
125
|
-
that can be used as decoder models within the [`EncoderDecoderModel`] class, which consists of all models
|
|
126
|
-
in `AUTO_MODELS_FOR_CAUSAL_LM`.
|
|
127
117
|
chunk_size_feed_forward (`int`, *optional*, defaults to `0`):
|
|
128
118
|
The chunk size of all feed forward layers in the residual attention blocks. A chunk size of `0` means that
|
|
129
119
|
the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes `n` <
|
|
@@ -134,43 +124,18 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
|
|
|
134
124
|
|
|
135
125
|
architectures (`list[str]`, *optional*):
|
|
136
126
|
Model architectures that can be used with the model pretrained weights.
|
|
137
|
-
finetuning_task (`str`, *optional*):
|
|
138
|
-
Name of the task used to fine-tune the model.
|
|
139
127
|
id2label (`dict[int, str]`, *optional*):
|
|
140
128
|
A map from index (for instance prediction index, or target index) to label.
|
|
141
129
|
label2id (`dict[str, int]`, *optional*):
|
|
142
130
|
A map from label to index for the model.
|
|
143
131
|
num_labels (`int`, *optional*):
|
|
144
132
|
Number of labels to use in the last layer added to the model, typically for a classification task.
|
|
145
|
-
task_specific_params (`dict[str, Any]`, *optional*):
|
|
146
|
-
Additional keyword arguments to store for the current task.
|
|
147
133
|
problem_type (`str`, *optional*):
|
|
148
134
|
Problem type for `XxxForSequenceClassification` models. Can be one of `"regression"`,
|
|
149
135
|
`"single_label_classification"` or `"multi_label_classification"`.
|
|
150
136
|
|
|
151
|
-
> Parameters linked to the tokenizer
|
|
152
|
-
|
|
153
|
-
tokenizer_class (`str`, *optional*):
|
|
154
|
-
The name of the associated tokenizer class to use (if none is set, will use the tokenizer associated to the
|
|
155
|
-
model by default).
|
|
156
|
-
prefix (`str`, *optional*):
|
|
157
|
-
A specific prompt that should be added at the beginning of each text before calling the model.
|
|
158
|
-
bos_token_id (`int`, *optional*):
|
|
159
|
-
The id of the _beginning-of-stream_ token.
|
|
160
|
-
pad_token_id (`int`, *optional*):
|
|
161
|
-
The id of the _padding_ token.
|
|
162
|
-
eos_token_id (`int`, *optional*):
|
|
163
|
-
The id of the _end-of-stream_ token.
|
|
164
|
-
decoder_start_token_id (`int`, *optional*):
|
|
165
|
-
If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token.
|
|
166
|
-
sep_token_id (`int`, *optional*):
|
|
167
|
-
The id of the _separation_ token.
|
|
168
|
-
|
|
169
137
|
> PyTorch specific parameters
|
|
170
138
|
|
|
171
|
-
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
|
|
172
|
-
Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
|
|
173
|
-
model has a output word embedding layer.
|
|
174
139
|
dtype (`str`, *optional*):
|
|
175
140
|
The `dtype` of the weights. This attribute can be used to initialize the model to a non-default `dtype`
|
|
176
141
|
(which is normally `float32`) and thus allow for optimal storage allocation. For example, if the saved
|
|
@@ -207,28 +172,14 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
|
|
|
207
172
|
return_dict: bool = True,
|
|
208
173
|
dtype: Union[str, "torch.dtype"] | None = None,
|
|
209
174
|
# Common arguments
|
|
210
|
-
tie_word_embeddings: bool = True,
|
|
211
175
|
chunk_size_feed_forward: int = 0,
|
|
212
176
|
is_encoder_decoder: bool = False,
|
|
213
|
-
is_decoder: bool = False,
|
|
214
|
-
cross_attention_hidden_size: int | None = None,
|
|
215
|
-
add_cross_attention: bool = False,
|
|
216
177
|
# Fine-tuning task arguments
|
|
217
178
|
architectures: list[str] | None = None,
|
|
218
|
-
finetuning_task: str | None = None,
|
|
219
179
|
id2label: dict[int, str] | None = None,
|
|
220
180
|
label2id: dict[str, int] | None = None,
|
|
221
181
|
num_labels: int | None = None,
|
|
222
|
-
task_specific_params: dict[str, Any] | None = None,
|
|
223
182
|
problem_type: str | None = None,
|
|
224
|
-
# Tokenizer kwargs
|
|
225
|
-
tokenizer_class: str | None = None,
|
|
226
|
-
prefix: str | None = None,
|
|
227
|
-
bos_token_id: int | None = None,
|
|
228
|
-
pad_token_id: int | None = None,
|
|
229
|
-
eos_token_id: int | None = None,
|
|
230
|
-
sep_token_id: int | None = None,
|
|
231
|
-
decoder_start_token_id: int | None = None,
|
|
232
183
|
**kwargs,
|
|
233
184
|
):
|
|
234
185
|
# Validation for some arguments
|
|
@@ -276,25 +227,15 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
|
|
|
276
227
|
self._output_attentions = output_attentions # has public property
|
|
277
228
|
|
|
278
229
|
# Less common kwargs, only used by some models
|
|
279
|
-
if "tie_encoder_decoder" in kwargs:
|
|
280
|
-
tie_encoder_decoder = kwargs.pop("tie_encoder_decoder")
|
|
281
|
-
tie_word_embeddings = tie_encoder_decoder or tie_word_embeddings
|
|
282
|
-
|
|
283
|
-
self.tie_word_embeddings = tie_word_embeddings
|
|
284
230
|
self.chunk_size_feed_forward = chunk_size_feed_forward
|
|
285
231
|
|
|
286
232
|
# Encoder-decoder models attributes
|
|
287
233
|
self.is_encoder_decoder = is_encoder_decoder
|
|
288
|
-
self.is_decoder = is_decoder # used in encoder-decoder models to differentiate encoder from decoder
|
|
289
|
-
self.cross_attention_hidden_size = cross_attention_hidden_size
|
|
290
|
-
self.add_cross_attention = add_cross_attention
|
|
291
234
|
|
|
292
235
|
# Fine-tuning task attributes
|
|
293
236
|
self.architectures = architectures
|
|
294
|
-
self.finetuning_task = finetuning_task
|
|
295
237
|
self.id2label = id2label
|
|
296
238
|
self.label2id = label2id
|
|
297
|
-
self.task_specific_params = task_specific_params
|
|
298
239
|
self.problem_type = problem_type
|
|
299
240
|
|
|
300
241
|
if self.id2label is None:
|
|
@@ -303,15 +244,6 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
|
|
|
303
244
|
# Keys are always strings in JSON so convert ids to int here.
|
|
304
245
|
self.id2label = {int(key): value for key, value in self.id2label.items()}
|
|
305
246
|
|
|
306
|
-
# Tokenizer attributes
|
|
307
|
-
self.tokenizer_class = tokenizer_class
|
|
308
|
-
self.prefix = prefix
|
|
309
|
-
self.bos_token_id = bos_token_id
|
|
310
|
-
self.pad_token_id = pad_token_id
|
|
311
|
-
self.eos_token_id = eos_token_id
|
|
312
|
-
self.sep_token_id = sep_token_id
|
|
313
|
-
self.decoder_start_token_id = decoder_start_token_id
|
|
314
|
-
|
|
315
247
|
# Parameters for sequence generation saved in the config are popped instead of loading them.
|
|
316
248
|
for parameter_name in GenerationConfig._get_default_generation_params().keys():
|
|
317
249
|
kwargs.pop(parameter_name, None)
|
|
@@ -321,7 +253,7 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
|
|
|
321
253
|
self._commit_hash = kwargs.pop("_commit_hash", None)
|
|
322
254
|
|
|
323
255
|
# Attention implementation to use, if relevant (it sets it recursively on sub-configs)
|
|
324
|
-
self._attn_implementation = kwargs.pop("attn_implementation", None)
|
|
256
|
+
self._attn_implementation: str | None = kwargs.pop("attn_implementation", None)
|
|
325
257
|
|
|
326
258
|
# Experts implementation to use, if relevant (it sets it recursively on sub-configs)
|
|
327
259
|
self._experts_implementation = kwargs.pop("experts_implementation", None)
|