transformers 5.0.0__py3-none-any.whl → 5.0.0rc0__py3-none-any.whl
This diff compares the contents of two package versions publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective registries.
- transformers/__init__.py +36 -55
- transformers/activations.py +1 -1
- transformers/audio_utils.py +33 -32
- transformers/cache_utils.py +139 -32
- transformers/cli/chat.py +3 -3
- transformers/cli/serve.py +19 -49
- transformers/cli/transformers.py +1 -2
- transformers/configuration_utils.py +155 -129
- transformers/conversion_mapping.py +22 -158
- transformers/convert_slow_tokenizer.py +17 -227
- transformers/core_model_loading.py +185 -528
- transformers/data/data_collator.py +4 -12
- transformers/data/processors/glue.py +1 -0
- transformers/data/processors/utils.py +1 -0
- transformers/data/processors/xnli.py +1 -0
- transformers/dependency_versions_check.py +1 -0
- transformers/dependency_versions_table.py +7 -5
- transformers/distributed/configuration_utils.py +2 -1
- transformers/dynamic_module_utils.py +25 -24
- transformers/feature_extraction_sequence_utils.py +23 -19
- transformers/feature_extraction_utils.py +33 -64
- transformers/file_utils.py +1 -0
- transformers/generation/__init__.py +1 -11
- transformers/generation/candidate_generator.py +33 -80
- transformers/generation/configuration_utils.py +133 -189
- transformers/generation/continuous_batching/__init__.py +1 -4
- transformers/generation/continuous_batching/cache.py +25 -83
- transformers/generation/continuous_batching/cache_manager.py +45 -155
- transformers/generation/continuous_batching/continuous_api.py +147 -270
- transformers/generation/continuous_batching/requests.py +3 -51
- transformers/generation/continuous_batching/scheduler.py +105 -160
- transformers/generation/logits_process.py +128 -0
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/streamers.py +1 -0
- transformers/generation/utils.py +123 -122
- transformers/generation/watermarking.py +6 -8
- transformers/hf_argparser.py +13 -9
- transformers/hyperparameter_search.py +2 -1
- transformers/image_processing_base.py +23 -12
- transformers/image_processing_utils.py +15 -11
- transformers/image_processing_utils_fast.py +75 -85
- transformers/image_transforms.py +42 -73
- transformers/image_utils.py +32 -30
- transformers/initialization.py +0 -37
- transformers/integrations/__init__.py +2 -16
- transformers/integrations/accelerate.py +113 -58
- transformers/integrations/aqlm.py +66 -36
- transformers/integrations/awq.py +516 -45
- transformers/integrations/bitnet.py +105 -47
- transformers/integrations/bitsandbytes.py +202 -91
- transformers/integrations/deepspeed.py +4 -161
- transformers/integrations/eetq.py +82 -84
- transformers/integrations/executorch.py +1 -1
- transformers/integrations/fbgemm_fp8.py +145 -190
- transformers/integrations/finegrained_fp8.py +215 -249
- transformers/integrations/flash_attention.py +3 -3
- transformers/integrations/flex_attention.py +1 -1
- transformers/integrations/fp_quant.py +0 -90
- transformers/integrations/ggml.py +2 -11
- transformers/integrations/higgs.py +62 -37
- transformers/integrations/hub_kernels.py +8 -65
- transformers/integrations/integration_utils.py +3 -47
- transformers/integrations/mistral.py +0 -12
- transformers/integrations/mxfp4.py +80 -33
- transformers/integrations/peft.py +191 -483
- transformers/integrations/quanto.py +56 -77
- transformers/integrations/spqr.py +90 -42
- transformers/integrations/tensor_parallel.py +221 -167
- transformers/integrations/torchao.py +43 -35
- transformers/integrations/vptq.py +59 -40
- transformers/kernels/__init__.py +0 -0
- transformers/{models/pe_audio_video/processing_pe_audio_video.py → kernels/falcon_mamba/__init__.py} +3 -12
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +529 -0
- transformers/loss/loss_utils.py +0 -2
- transformers/masking_utils.py +55 -51
- transformers/model_debugging_utils.py +5 -4
- transformers/modelcard.py +194 -15
- transformers/modeling_attn_mask_utils.py +19 -19
- transformers/modeling_flash_attention_utils.py +27 -27
- transformers/modeling_gguf_pytorch_utils.py +24 -79
- transformers/modeling_layers.py +22 -21
- transformers/modeling_outputs.py +253 -242
- transformers/modeling_rope_utils.py +117 -138
- transformers/modeling_utils.py +739 -850
- transformers/models/__init__.py +0 -27
- transformers/models/afmoe/configuration_afmoe.py +33 -40
- transformers/models/afmoe/modeling_afmoe.py +54 -42
- transformers/models/afmoe/modular_afmoe.py +33 -23
- transformers/models/aimv2/configuration_aimv2.py +10 -2
- transformers/models/aimv2/modeling_aimv2.py +42 -47
- transformers/models/aimv2/modular_aimv2.py +19 -17
- transformers/models/albert/configuration_albert.py +2 -8
- transformers/models/albert/modeling_albert.py +69 -70
- transformers/models/albert/tokenization_albert.py +14 -5
- transformers/models/align/configuration_align.py +6 -8
- transformers/models/align/modeling_align.py +89 -94
- transformers/models/align/processing_align.py +30 -2
- transformers/models/altclip/configuration_altclip.py +7 -4
- transformers/models/altclip/modeling_altclip.py +103 -114
- transformers/models/altclip/processing_altclip.py +15 -2
- transformers/models/apertus/__init__.py +1 -0
- transformers/models/apertus/configuration_apertus.py +28 -23
- transformers/models/apertus/modeling_apertus.py +40 -39
- transformers/models/apertus/modular_apertus.py +38 -37
- transformers/models/arcee/configuration_arcee.py +30 -25
- transformers/models/arcee/modeling_arcee.py +39 -36
- transformers/models/arcee/modular_arcee.py +23 -20
- transformers/models/aria/configuration_aria.py +44 -31
- transformers/models/aria/image_processing_aria.py +27 -25
- transformers/models/aria/modeling_aria.py +106 -110
- transformers/models/aria/modular_aria.py +127 -118
- transformers/models/aria/processing_aria.py +35 -28
- transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +1 -0
- transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +6 -3
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +8 -6
- transformers/models/audioflamingo3/__init__.py +1 -0
- transformers/models/audioflamingo3/configuration_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +49 -58
- transformers/models/audioflamingo3/modular_audioflamingo3.py +43 -53
- transformers/models/audioflamingo3/processing_audioflamingo3.py +30 -33
- transformers/models/auto/auto_factory.py +7 -6
- transformers/models/auto/configuration_auto.py +5 -66
- transformers/models/auto/feature_extraction_auto.py +10 -14
- transformers/models/auto/image_processing_auto.py +41 -32
- transformers/models/auto/modeling_auto.py +188 -46
- transformers/models/auto/processing_auto.py +11 -24
- transformers/models/auto/tokenization_auto.py +588 -171
- transformers/models/auto/video_processing_auto.py +10 -12
- transformers/models/autoformer/configuration_autoformer.py +7 -4
- transformers/models/autoformer/modeling_autoformer.py +101 -104
- transformers/models/aya_vision/configuration_aya_vision.py +1 -4
- transformers/models/aya_vision/modeling_aya_vision.py +102 -71
- transformers/models/aya_vision/modular_aya_vision.py +74 -46
- transformers/models/aya_vision/processing_aya_vision.py +53 -25
- transformers/models/bamba/configuration_bamba.py +39 -34
- transformers/models/bamba/modeling_bamba.py +86 -82
- transformers/models/bamba/modular_bamba.py +72 -70
- transformers/models/bark/configuration_bark.py +8 -6
- transformers/models/bark/generation_configuration_bark.py +5 -3
- transformers/models/bark/modeling_bark.py +57 -54
- transformers/models/bark/processing_bark.py +41 -19
- transformers/models/bart/configuration_bart.py +6 -9
- transformers/models/bart/modeling_bart.py +126 -135
- transformers/models/barthez/tokenization_barthez.py +11 -3
- transformers/models/bartpho/tokenization_bartpho.py +7 -6
- transformers/models/beit/configuration_beit.py +11 -0
- transformers/models/beit/image_processing_beit.py +56 -53
- transformers/models/beit/image_processing_beit_fast.py +12 -10
- transformers/models/beit/modeling_beit.py +60 -69
- transformers/models/bert/configuration_bert.py +2 -12
- transformers/models/bert/modeling_bert.py +122 -114
- transformers/models/bert/tokenization_bert.py +23 -8
- transformers/models/bert/tokenization_bert_legacy.py +5 -3
- transformers/models/bert_generation/configuration_bert_generation.py +2 -17
- transformers/models/bert_generation/modeling_bert_generation.py +49 -49
- transformers/models/bert_generation/tokenization_bert_generation.py +3 -2
- transformers/models/bert_japanese/tokenization_bert_japanese.py +6 -5
- transformers/models/bertweet/tokenization_bertweet.py +3 -1
- transformers/models/big_bird/configuration_big_bird.py +9 -12
- transformers/models/big_bird/modeling_big_bird.py +109 -116
- transformers/models/big_bird/tokenization_big_bird.py +43 -16
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -9
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +117 -130
- transformers/models/biogpt/configuration_biogpt.py +2 -8
- transformers/models/biogpt/modeling_biogpt.py +76 -72
- transformers/models/biogpt/modular_biogpt.py +66 -62
- transformers/models/biogpt/tokenization_biogpt.py +5 -3
- transformers/models/bit/configuration_bit.py +1 -0
- transformers/models/bit/image_processing_bit.py +24 -21
- transformers/models/bit/image_processing_bit_fast.py +1 -0
- transformers/models/bit/modeling_bit.py +12 -25
- transformers/models/bitnet/configuration_bitnet.py +28 -23
- transformers/models/bitnet/modeling_bitnet.py +39 -36
- transformers/models/bitnet/modular_bitnet.py +6 -4
- transformers/models/blenderbot/configuration_blenderbot.py +5 -8
- transformers/models/blenderbot/modeling_blenderbot.py +96 -77
- transformers/models/blenderbot/tokenization_blenderbot.py +24 -18
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +5 -8
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +69 -79
- transformers/models/blenderbot_small/tokenization_blenderbot_small.py +3 -1
- transformers/models/blip/configuration_blip.py +10 -9
- transformers/models/blip/image_processing_blip.py +20 -17
- transformers/models/blip/image_processing_blip_fast.py +1 -0
- transformers/models/blip/modeling_blip.py +108 -117
- transformers/models/blip/modeling_blip_text.py +65 -73
- transformers/models/blip/processing_blip.py +36 -5
- transformers/models/blip_2/configuration_blip_2.py +2 -2
- transformers/models/blip_2/modeling_blip_2.py +118 -146
- transformers/models/blip_2/processing_blip_2.py +38 -8
- transformers/models/bloom/configuration_bloom.py +2 -5
- transformers/models/bloom/modeling_bloom.py +104 -77
- transformers/models/blt/configuration_blt.py +86 -94
- transformers/models/blt/modeling_blt.py +81 -238
- transformers/models/blt/modular_blt.py +65 -228
- transformers/models/bridgetower/configuration_bridgetower.py +2 -7
- transformers/models/bridgetower/image_processing_bridgetower.py +35 -34
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +16 -13
- transformers/models/bridgetower/modeling_bridgetower.py +119 -141
- transformers/models/bridgetower/processing_bridgetower.py +16 -2
- transformers/models/bros/configuration_bros.py +18 -24
- transformers/models/bros/modeling_bros.py +80 -90
- transformers/models/bros/processing_bros.py +12 -2
- transformers/models/byt5/tokenization_byt5.py +6 -4
- transformers/models/camembert/configuration_camembert.py +2 -8
- transformers/models/camembert/modeling_camembert.py +195 -196
- transformers/models/camembert/modular_camembert.py +54 -51
- transformers/models/camembert/tokenization_camembert.py +13 -6
- transformers/models/canine/configuration_canine.py +2 -4
- transformers/models/canine/modeling_canine.py +75 -84
- transformers/models/canine/tokenization_canine.py +1 -2
- transformers/models/chameleon/configuration_chameleon.py +34 -29
- transformers/models/chameleon/image_processing_chameleon.py +24 -21
- transformers/models/chameleon/image_processing_chameleon_fast.py +6 -5
- transformers/models/chameleon/modeling_chameleon.py +93 -142
- transformers/models/chameleon/processing_chameleon.py +41 -16
- transformers/models/chinese_clip/configuration_chinese_clip.py +8 -10
- transformers/models/chinese_clip/image_processing_chinese_clip.py +24 -21
- transformers/models/chinese_clip/image_processing_chinese_clip_fast.py +1 -0
- transformers/models/chinese_clip/modeling_chinese_clip.py +92 -96
- transformers/models/chinese_clip/processing_chinese_clip.py +15 -2
- transformers/models/clap/configuration_clap.py +9 -4
- transformers/models/clap/feature_extraction_clap.py +12 -11
- transformers/models/clap/modeling_clap.py +123 -136
- transformers/models/clap/processing_clap.py +15 -2
- transformers/models/clip/configuration_clip.py +2 -4
- transformers/models/clip/image_processing_clip.py +24 -21
- transformers/models/clip/image_processing_clip_fast.py +1 -9
- transformers/models/clip/modeling_clip.py +65 -65
- transformers/models/clip/processing_clip.py +14 -2
- transformers/models/clip/tokenization_clip.py +46 -21
- transformers/models/clipseg/configuration_clipseg.py +2 -4
- transformers/models/clipseg/modeling_clipseg.py +109 -119
- transformers/models/clipseg/processing_clipseg.py +42 -19
- transformers/models/clvp/configuration_clvp.py +5 -15
- transformers/models/clvp/feature_extraction_clvp.py +10 -7
- transformers/models/clvp/modeling_clvp.py +146 -155
- transformers/models/clvp/number_normalizer.py +2 -1
- transformers/models/clvp/processing_clvp.py +20 -3
- transformers/models/clvp/tokenization_clvp.py +64 -1
- transformers/models/code_llama/tokenization_code_llama.py +44 -18
- transformers/models/codegen/configuration_codegen.py +4 -4
- transformers/models/codegen/modeling_codegen.py +53 -63
- transformers/models/codegen/tokenization_codegen.py +47 -17
- transformers/models/cohere/configuration_cohere.py +30 -25
- transformers/models/cohere/modeling_cohere.py +42 -40
- transformers/models/cohere/modular_cohere.py +29 -26
- transformers/models/cohere/tokenization_cohere.py +46 -15
- transformers/models/cohere2/configuration_cohere2.py +32 -31
- transformers/models/cohere2/modeling_cohere2.py +44 -42
- transformers/models/cohere2/modular_cohere2.py +54 -54
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +14 -13
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +58 -59
- transformers/models/cohere2_vision/modular_cohere2_vision.py +46 -45
- transformers/models/cohere2_vision/processing_cohere2_vision.py +36 -6
- transformers/models/colpali/configuration_colpali.py +1 -0
- transformers/models/colpali/modeling_colpali.py +16 -14
- transformers/models/colpali/modular_colpali.py +51 -11
- transformers/models/colpali/processing_colpali.py +52 -14
- transformers/models/colqwen2/modeling_colqwen2.py +28 -28
- transformers/models/colqwen2/modular_colqwen2.py +74 -37
- transformers/models/colqwen2/processing_colqwen2.py +52 -16
- transformers/models/conditional_detr/configuration_conditional_detr.py +2 -1
- transformers/models/conditional_detr/image_processing_conditional_detr.py +70 -67
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +36 -36
- transformers/models/conditional_detr/modeling_conditional_detr.py +87 -99
- transformers/models/conditional_detr/modular_conditional_detr.py +3 -49
- transformers/models/convbert/configuration_convbert.py +8 -11
- transformers/models/convbert/modeling_convbert.py +87 -94
- transformers/models/convbert/tokenization_convbert.py +1 -0
- transformers/models/convnext/configuration_convnext.py +1 -0
- transformers/models/convnext/image_processing_convnext.py +23 -20
- transformers/models/convnext/image_processing_convnext_fast.py +21 -16
- transformers/models/convnext/modeling_convnext.py +12 -9
- transformers/models/convnextv2/configuration_convnextv2.py +1 -0
- transformers/models/convnextv2/modeling_convnextv2.py +12 -9
- transformers/models/cpm/tokenization_cpm.py +7 -6
- transformers/models/cpm/tokenization_cpm_fast.py +5 -3
- transformers/models/cpmant/configuration_cpmant.py +1 -4
- transformers/models/cpmant/modeling_cpmant.py +40 -38
- transformers/models/cpmant/tokenization_cpmant.py +3 -1
- transformers/models/csm/configuration_csm.py +66 -58
- transformers/models/csm/generation_csm.py +35 -31
- transformers/models/csm/modeling_csm.py +85 -85
- transformers/models/csm/modular_csm.py +58 -58
- transformers/models/csm/processing_csm.py +68 -25
- transformers/models/ctrl/configuration_ctrl.py +1 -16
- transformers/models/ctrl/modeling_ctrl.py +44 -54
- transformers/models/ctrl/tokenization_ctrl.py +1 -0
- transformers/models/cvt/configuration_cvt.py +1 -0
- transformers/models/cvt/modeling_cvt.py +16 -20
- transformers/models/cwm/__init__.py +1 -0
- transformers/models/cwm/configuration_cwm.py +12 -8
- transformers/models/cwm/modeling_cwm.py +39 -37
- transformers/models/cwm/modular_cwm.py +12 -10
- transformers/models/d_fine/configuration_d_fine.py +5 -7
- transformers/models/d_fine/modeling_d_fine.py +128 -138
- transformers/models/d_fine/modular_d_fine.py +18 -33
- transformers/models/dab_detr/configuration_dab_detr.py +3 -6
- transformers/models/dab_detr/modeling_dab_detr.py +75 -81
- transformers/models/dac/configuration_dac.py +1 -0
- transformers/models/dac/feature_extraction_dac.py +9 -6
- transformers/models/dac/modeling_dac.py +26 -24
- transformers/models/data2vec/configuration_data2vec_audio.py +2 -4
- transformers/models/data2vec/configuration_data2vec_text.py +3 -11
- transformers/models/data2vec/configuration_data2vec_vision.py +1 -0
- transformers/models/data2vec/modeling_data2vec_audio.py +56 -57
- transformers/models/data2vec/modeling_data2vec_text.py +93 -98
- transformers/models/data2vec/modeling_data2vec_vision.py +45 -49
- transformers/models/data2vec/modular_data2vec_audio.py +1 -6
- transformers/models/data2vec/modular_data2vec_text.py +54 -58
- transformers/models/dbrx/configuration_dbrx.py +22 -36
- transformers/models/dbrx/modeling_dbrx.py +45 -42
- transformers/models/dbrx/modular_dbrx.py +33 -31
- transformers/models/deberta/configuration_deberta.py +1 -6
- transformers/models/deberta/modeling_deberta.py +60 -64
- transformers/models/deberta/tokenization_deberta.py +21 -9
- transformers/models/deberta_v2/configuration_deberta_v2.py +1 -6
- transformers/models/deberta_v2/modeling_deberta_v2.py +65 -71
- transformers/models/deberta_v2/tokenization_deberta_v2.py +29 -11
- transformers/models/decision_transformer/configuration_decision_transformer.py +2 -3
- transformers/models/decision_transformer/modeling_decision_transformer.py +56 -60
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +44 -39
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +43 -43
- transformers/models/deepseek_v2/modular_deepseek_v2.py +49 -48
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +45 -40
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +42 -45
- transformers/models/deepseek_v3/modular_deepseek_v3.py +9 -14
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +3 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +26 -25
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +10 -10
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +48 -57
- transformers/models/deepseek_vl/modular_deepseek_vl.py +43 -14
- transformers/models/deepseek_vl/processing_deepseek_vl.py +41 -10
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +5 -3
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +35 -35
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +24 -20
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +61 -109
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +118 -146
- transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +44 -12
- transformers/models/deformable_detr/configuration_deformable_detr.py +3 -2
- transformers/models/deformable_detr/image_processing_deformable_detr.py +61 -59
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +28 -28
- transformers/models/deformable_detr/modeling_deformable_detr.py +82 -88
- transformers/models/deformable_detr/modular_deformable_detr.py +3 -1
- transformers/models/deit/configuration_deit.py +1 -0
- transformers/models/deit/image_processing_deit.py +21 -18
- transformers/models/deit/image_processing_deit_fast.py +1 -0
- transformers/models/deit/modeling_deit.py +22 -24
- transformers/models/depth_anything/configuration_depth_anything.py +4 -2
- transformers/models/depth_anything/modeling_depth_anything.py +10 -10
- transformers/models/depth_pro/configuration_depth_pro.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro.py +23 -22
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +10 -8
- transformers/models/depth_pro/modeling_depth_pro.py +27 -31
- transformers/models/detr/configuration_detr.py +2 -1
- transformers/models/detr/image_processing_detr.py +66 -64
- transformers/models/detr/image_processing_detr_fast.py +34 -33
- transformers/models/detr/modeling_detr.py +79 -95
- transformers/models/dia/configuration_dia.py +15 -9
- transformers/models/dia/feature_extraction_dia.py +9 -6
- transformers/models/dia/generation_dia.py +50 -48
- transformers/models/dia/modeling_dia.py +69 -78
- transformers/models/dia/modular_dia.py +56 -64
- transformers/models/dia/processing_dia.py +29 -39
- transformers/models/dia/tokenization_dia.py +6 -3
- transformers/models/diffllama/configuration_diffllama.py +30 -25
- transformers/models/diffllama/modeling_diffllama.py +49 -46
- transformers/models/diffllama/modular_diffllama.py +19 -17
- transformers/models/dinat/configuration_dinat.py +1 -0
- transformers/models/dinat/modeling_dinat.py +44 -47
- transformers/models/dinov2/configuration_dinov2.py +1 -0
- transformers/models/dinov2/modeling_dinov2.py +15 -15
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +1 -1
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +15 -16
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +9 -9
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +7 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +6 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +8 -5
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +9 -7
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +18 -19
- transformers/models/dinov3_vit/modular_dinov3_vit.py +15 -16
- transformers/models/distilbert/configuration_distilbert.py +2 -8
- transformers/models/distilbert/modeling_distilbert.py +55 -55
- transformers/models/distilbert/tokenization_distilbert.py +1 -13
- transformers/models/doge/__init__.py +1 -0
- transformers/models/doge/configuration_doge.py +32 -39
- transformers/models/doge/modeling_doge.py +49 -45
- transformers/models/doge/modular_doge.py +63 -71
- transformers/models/donut/configuration_donut_swin.py +1 -0
- transformers/models/donut/image_processing_donut.py +29 -26
- transformers/models/donut/image_processing_donut_fast.py +15 -9
- transformers/models/donut/modeling_donut_swin.py +58 -62
- transformers/models/donut/processing_donut.py +26 -5
- transformers/models/dots1/configuration_dots1.py +33 -41
- transformers/models/dots1/modeling_dots1.py +45 -54
- transformers/models/dots1/modular_dots1.py +4 -5
- transformers/models/dpr/configuration_dpr.py +2 -19
- transformers/models/dpr/modeling_dpr.py +39 -42
- transformers/models/dpr/tokenization_dpr.py +9 -19
- transformers/models/dpr/tokenization_dpr_fast.py +9 -7
- transformers/models/dpt/configuration_dpt.py +2 -1
- transformers/models/dpt/image_processing_dpt.py +66 -65
- transformers/models/dpt/image_processing_dpt_fast.py +20 -18
- transformers/models/dpt/modeling_dpt.py +30 -32
- transformers/models/dpt/modular_dpt.py +17 -15
- transformers/models/edgetam/configuration_edgetam.py +3 -2
- transformers/models/edgetam/modeling_edgetam.py +86 -86
- transformers/models/edgetam/modular_edgetam.py +26 -21
- transformers/models/edgetam_video/__init__.py +1 -0
- transformers/models/edgetam_video/configuration_edgetam_video.py +1 -0
- transformers/models/edgetam_video/modeling_edgetam_video.py +158 -169
- transformers/models/edgetam_video/modular_edgetam_video.py +37 -30
- transformers/models/efficientloftr/configuration_efficientloftr.py +5 -4
- transformers/models/efficientloftr/image_processing_efficientloftr.py +16 -14
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +9 -9
- transformers/models/efficientloftr/modeling_efficientloftr.py +38 -59
- transformers/models/efficientloftr/modular_efficientloftr.py +3 -1
- transformers/models/efficientnet/configuration_efficientnet.py +1 -0
- transformers/models/efficientnet/image_processing_efficientnet.py +32 -28
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +19 -17
- transformers/models/efficientnet/modeling_efficientnet.py +15 -19
- transformers/models/electra/configuration_electra.py +3 -13
- transformers/models/electra/modeling_electra.py +103 -108
- transformers/models/emu3/configuration_emu3.py +17 -13
- transformers/models/emu3/image_processing_emu3.py +39 -44
- transformers/models/emu3/modeling_emu3.py +108 -148
- transformers/models/emu3/modular_emu3.py +73 -115
- transformers/models/emu3/processing_emu3.py +43 -18
- transformers/models/encodec/configuration_encodec.py +4 -2
- transformers/models/encodec/feature_extraction_encodec.py +13 -10
- transformers/models/encodec/modeling_encodec.py +29 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +2 -12
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +43 -37
- transformers/models/eomt/configuration_eomt.py +1 -0
- transformers/models/eomt/image_processing_eomt.py +56 -66
- transformers/models/eomt/image_processing_eomt_fast.py +33 -76
- transformers/models/eomt/modeling_eomt.py +18 -23
- transformers/models/eomt/modular_eomt.py +13 -18
- transformers/models/ernie/configuration_ernie.py +3 -24
- transformers/models/ernie/modeling_ernie.py +132 -127
- transformers/models/ernie/modular_ernie.py +103 -97
- transformers/models/ernie4_5/configuration_ernie4_5.py +27 -23
- transformers/models/ernie4_5/modeling_ernie4_5.py +38 -36
- transformers/models/ernie4_5/modular_ernie4_5.py +4 -3
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +36 -32
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +55 -56
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +46 -18
- transformers/models/esm/configuration_esm.py +15 -11
- transformers/models/esm/modeling_esm.py +34 -38
- transformers/models/esm/modeling_esmfold.py +49 -53
- transformers/models/esm/openfold_utils/chunk_utils.py +6 -6
- transformers/models/esm/openfold_utils/loss.py +2 -1
- transformers/models/esm/openfold_utils/protein.py +16 -15
- transformers/models/esm/openfold_utils/tensor_utils.py +6 -6
- transformers/models/esm/tokenization_esm.py +4 -2
- transformers/models/evolla/configuration_evolla.py +40 -50
- transformers/models/evolla/modeling_evolla.py +66 -71
- transformers/models/evolla/modular_evolla.py +47 -53
- transformers/models/evolla/processing_evolla.py +35 -23
- transformers/models/exaone4/configuration_exaone4.py +25 -23
- transformers/models/exaone4/modeling_exaone4.py +38 -35
- transformers/models/exaone4/modular_exaone4.py +46 -44
- transformers/models/falcon/configuration_falcon.py +26 -31
- transformers/models/falcon/modeling_falcon.py +80 -82
- transformers/models/falcon_h1/configuration_falcon_h1.py +51 -45
- transformers/models/falcon_h1/modeling_falcon_h1.py +82 -85
- transformers/models/falcon_h1/modular_falcon_h1.py +51 -56
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +2 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +82 -75
- transformers/models/falcon_mamba/modular_falcon_mamba.py +45 -28
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +6 -2
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +60 -76
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +3 -2
- transformers/models/flaubert/configuration_flaubert.py +5 -10
- transformers/models/flaubert/modeling_flaubert.py +143 -145
- transformers/models/flaubert/tokenization_flaubert.py +5 -3
- transformers/models/flava/configuration_flava.py +6 -5
- transformers/models/flava/image_processing_flava.py +67 -66
- transformers/models/flava/image_processing_flava_fast.py +49 -46
- transformers/models/flava/modeling_flava.py +136 -153
- transformers/models/flava/processing_flava.py +12 -2
- transformers/models/flex_olmo/__init__.py +1 -0
- transformers/models/flex_olmo/configuration_flex_olmo.py +32 -28
- transformers/models/flex_olmo/modeling_flex_olmo.py +47 -47
- transformers/models/flex_olmo/modular_flex_olmo.py +44 -40
- transformers/models/florence2/configuration_florence2.py +1 -0
- transformers/models/florence2/modeling_florence2.py +69 -111
- transformers/models/florence2/modular_florence2.py +101 -104
- transformers/models/florence2/processing_florence2.py +47 -18
- transformers/models/fnet/configuration_fnet.py +2 -6
- transformers/models/fnet/modeling_fnet.py +80 -83
- transformers/models/fnet/tokenization_fnet.py +1 -0
- transformers/models/focalnet/configuration_focalnet.py +1 -0
- transformers/models/focalnet/modeling_focalnet.py +45 -51
- transformers/models/fsmt/configuration_fsmt.py +17 -12
- transformers/models/fsmt/modeling_fsmt.py +48 -49
- transformers/models/fsmt/tokenization_fsmt.py +5 -3
- transformers/models/funnel/configuration_funnel.py +1 -8
- transformers/models/funnel/modeling_funnel.py +93 -99
- transformers/models/funnel/tokenization_funnel.py +27 -17
- transformers/models/fuyu/configuration_fuyu.py +34 -28
- transformers/models/fuyu/image_processing_fuyu.py +31 -29
- transformers/models/fuyu/image_processing_fuyu_fast.py +17 -17
- transformers/models/fuyu/modeling_fuyu.py +53 -53
- transformers/models/fuyu/processing_fuyu.py +34 -23
- transformers/models/gemma/configuration_gemma.py +30 -25
- transformers/models/gemma/modeling_gemma.py +50 -46
- transformers/models/gemma/modular_gemma.py +47 -42
- transformers/models/gemma/tokenization_gemma.py +30 -10
- transformers/models/gemma2/configuration_gemma2.py +35 -30
- transformers/models/gemma2/modeling_gemma2.py +42 -39
- transformers/models/gemma2/modular_gemma2.py +66 -63
- transformers/models/gemma3/configuration_gemma3.py +44 -44
- transformers/models/gemma3/image_processing_gemma3.py +31 -29
- transformers/models/gemma3/image_processing_gemma3_fast.py +13 -11
- transformers/models/gemma3/modeling_gemma3.py +207 -159
- transformers/models/gemma3/modular_gemma3.py +204 -153
- transformers/models/gemma3/processing_gemma3.py +5 -5
- transformers/models/gemma3n/configuration_gemma3n.py +26 -36
- transformers/models/gemma3n/feature_extraction_gemma3n.py +11 -9
- transformers/models/gemma3n/modeling_gemma3n.py +356 -222
- transformers/models/gemma3n/modular_gemma3n.py +207 -230
- transformers/models/gemma3n/processing_gemma3n.py +26 -12
- transformers/models/git/configuration_git.py +8 -5
- transformers/models/git/modeling_git.py +204 -266
- transformers/models/git/processing_git.py +14 -2
- transformers/models/glm/configuration_glm.py +28 -24
- transformers/models/glm/modeling_glm.py +40 -37
- transformers/models/glm/modular_glm.py +7 -4
- transformers/models/glm4/configuration_glm4.py +28 -24
- transformers/models/glm4/modeling_glm4.py +42 -40
- transformers/models/glm4/modular_glm4.py +10 -8
- transformers/models/glm46v/configuration_glm46v.py +1 -0
- transformers/models/glm46v/image_processing_glm46v.py +40 -35
- transformers/models/glm46v/image_processing_glm46v_fast.py +9 -9
- transformers/models/glm46v/modeling_glm46v.py +90 -137
- transformers/models/glm46v/modular_glm46v.py +3 -4
- transformers/models/glm46v/processing_glm46v.py +41 -7
- transformers/models/glm46v/video_processing_glm46v.py +11 -9
- transformers/models/glm4_moe/configuration_glm4_moe.py +32 -40
- transformers/models/glm4_moe/modeling_glm4_moe.py +42 -45
- transformers/models/glm4_moe/modular_glm4_moe.py +34 -42
- transformers/models/glm4v/configuration_glm4v.py +20 -18
- transformers/models/glm4v/image_processing_glm4v.py +40 -34
- transformers/models/glm4v/image_processing_glm4v_fast.py +9 -8
- transformers/models/glm4v/modeling_glm4v.py +205 -254
- transformers/models/glm4v/modular_glm4v.py +224 -210
- transformers/models/glm4v/processing_glm4v.py +41 -7
- transformers/models/glm4v/video_processing_glm4v.py +11 -9
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +125 -136
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +368 -377
- transformers/models/glm4v_moe/modular_glm4v_moe.py +169 -83
- transformers/models/glpn/configuration_glpn.py +1 -0
- transformers/models/glpn/image_processing_glpn.py +12 -11
- transformers/models/glpn/image_processing_glpn_fast.py +13 -11
- transformers/models/glpn/modeling_glpn.py +14 -16
- transformers/models/got_ocr2/configuration_got_ocr2.py +12 -4
- transformers/models/got_ocr2/image_processing_got_ocr2.py +24 -22
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +11 -9
- transformers/models/got_ocr2/modeling_got_ocr2.py +80 -77
- transformers/models/got_ocr2/modular_got_ocr2.py +51 -54
- transformers/models/got_ocr2/processing_got_ocr2.py +63 -42
- transformers/models/gpt2/configuration_gpt2.py +2 -13
- transformers/models/gpt2/modeling_gpt2.py +115 -120
- transformers/models/gpt2/tokenization_gpt2.py +46 -15
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +2 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +89 -79
- transformers/models/gpt_neo/configuration_gpt_neo.py +2 -9
- transformers/models/gpt_neo/modeling_gpt_neo.py +67 -83
- transformers/models/gpt_neox/configuration_gpt_neox.py +25 -25
- transformers/models/gpt_neox/modeling_gpt_neox.py +75 -76
- transformers/models/gpt_neox/modular_gpt_neox.py +66 -67
- transformers/models/gpt_neox/tokenization_gpt_neox.py +51 -9
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +19 -24
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +47 -46
- transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +3 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +28 -46
- transformers/models/gpt_oss/modeling_gpt_oss.py +121 -83
- transformers/models/gpt_oss/modular_gpt_oss.py +103 -64
- transformers/models/gpt_sw3/tokenization_gpt_sw3.py +4 -4
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +87 -101
- transformers/models/granite/configuration_granite.py +33 -28
- transformers/models/granite/modeling_granite.py +46 -44
- transformers/models/granite/modular_granite.py +31 -29
- transformers/models/granite_speech/configuration_granite_speech.py +1 -0
- transformers/models/granite_speech/feature_extraction_granite_speech.py +3 -1
- transformers/models/granite_speech/modeling_granite_speech.py +52 -82
- transformers/models/granite_speech/processing_granite_speech.py +4 -11
- transformers/models/granitemoe/configuration_granitemoe.py +36 -31
- transformers/models/granitemoe/modeling_granitemoe.py +46 -41
- transformers/models/granitemoe/modular_granitemoe.py +27 -22
- transformers/models/granitemoehybrid/__init__.py +1 -0
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +47 -46
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +93 -97
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +21 -54
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +37 -33
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +61 -54
- transformers/models/granitemoeshared/modular_granitemoeshared.py +21 -19
- transformers/models/grounding_dino/configuration_grounding_dino.py +4 -6
- transformers/models/grounding_dino/image_processing_grounding_dino.py +62 -60
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +29 -28
- transformers/models/grounding_dino/modeling_grounding_dino.py +140 -155
- transformers/models/grounding_dino/modular_grounding_dino.py +3 -2
- transformers/models/grounding_dino/processing_grounding_dino.py +38 -10
- transformers/models/groupvit/configuration_groupvit.py +2 -4
- transformers/models/groupvit/modeling_groupvit.py +93 -107
- transformers/models/helium/configuration_helium.py +29 -25
- transformers/models/helium/modeling_helium.py +40 -38
- transformers/models/helium/modular_helium.py +7 -3
- transformers/models/herbert/tokenization_herbert.py +28 -10
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +1 -0
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -24
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -24
- transformers/models/hiera/configuration_hiera.py +1 -0
- transformers/models/hiera/modeling_hiera.py +66 -72
- transformers/models/hubert/configuration_hubert.py +2 -4
- transformers/models/hubert/modeling_hubert.py +37 -42
- transformers/models/hubert/modular_hubert.py +11 -13
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +31 -26
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +38 -35
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +6 -4
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +36 -31
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +42 -47
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +9 -9
- transformers/models/ibert/configuration_ibert.py +2 -4
- transformers/models/ibert/modeling_ibert.py +62 -82
- transformers/models/ibert/quant_modules.py +1 -0
- transformers/models/idefics/configuration_idefics.py +8 -5
- transformers/models/idefics/image_processing_idefics.py +15 -13
- transformers/models/idefics/modeling_idefics.py +82 -75
- transformers/models/idefics/perceiver.py +3 -1
- transformers/models/idefics/processing_idefics.py +48 -32
- transformers/models/idefics/vision.py +25 -24
- transformers/models/idefics2/configuration_idefics2.py +3 -1
- transformers/models/idefics2/image_processing_idefics2.py +32 -31
- transformers/models/idefics2/image_processing_idefics2_fast.py +8 -8
- transformers/models/idefics2/modeling_idefics2.py +101 -127
- transformers/models/idefics2/processing_idefics2.py +68 -10
- transformers/models/idefics3/configuration_idefics3.py +4 -1
- transformers/models/idefics3/image_processing_idefics3.py +43 -42
- transformers/models/idefics3/image_processing_idefics3_fast.py +15 -40
- transformers/models/idefics3/modeling_idefics3.py +90 -115
- transformers/models/idefics3/processing_idefics3.py +69 -15
- transformers/models/ijepa/configuration_ijepa.py +1 -0
- transformers/models/ijepa/modeling_ijepa.py +11 -10
- transformers/models/ijepa/modular_ijepa.py +7 -5
- transformers/models/imagegpt/configuration_imagegpt.py +2 -9
- transformers/models/imagegpt/image_processing_imagegpt.py +18 -17
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +16 -11
- transformers/models/imagegpt/modeling_imagegpt.py +65 -76
- transformers/models/informer/configuration_informer.py +9 -6
- transformers/models/informer/modeling_informer.py +86 -88
- transformers/models/informer/modular_informer.py +16 -14
- transformers/models/instructblip/configuration_instructblip.py +2 -2
- transformers/models/instructblip/modeling_instructblip.py +63 -103
- transformers/models/instructblip/processing_instructblip.py +36 -10
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -2
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +139 -157
- transformers/models/instructblipvideo/modular_instructblipvideo.py +64 -73
- transformers/models/instructblipvideo/processing_instructblipvideo.py +33 -14
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +8 -6
- transformers/models/internvl/configuration_internvl.py +1 -0
- transformers/models/internvl/modeling_internvl.py +106 -85
- transformers/models/internvl/modular_internvl.py +67 -47
- transformers/models/internvl/processing_internvl.py +45 -12
- transformers/models/internvl/video_processing_internvl.py +12 -10
- transformers/models/jamba/configuration_jamba.py +8 -5
- transformers/models/jamba/modeling_jamba.py +66 -68
- transformers/models/jamba/modular_jamba.py +55 -54
- transformers/models/janus/configuration_janus.py +1 -0
- transformers/models/janus/image_processing_janus.py +37 -35
- transformers/models/janus/image_processing_janus_fast.py +20 -18
- transformers/models/janus/modeling_janus.py +191 -115
- transformers/models/janus/modular_janus.py +84 -133
- transformers/models/janus/processing_janus.py +43 -17
- transformers/models/jetmoe/configuration_jetmoe.py +26 -24
- transformers/models/jetmoe/modeling_jetmoe.py +46 -43
- transformers/models/jetmoe/modular_jetmoe.py +33 -31
- transformers/models/kosmos2/configuration_kosmos2.py +9 -10
- transformers/models/kosmos2/modeling_kosmos2.py +173 -208
- transformers/models/kosmos2/processing_kosmos2.py +55 -40
- transformers/models/kosmos2_5/__init__.py +1 -0
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +9 -8
- transformers/models/kosmos2_5/image_processing_kosmos2_5.py +12 -10
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +13 -4
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +118 -132
- transformers/models/kosmos2_5/processing_kosmos2_5.py +29 -8
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +28 -31
- transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +14 -12
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +100 -110
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +22 -28
- transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py +8 -2
- transformers/models/layoutlm/configuration_layoutlm.py +2 -14
- transformers/models/layoutlm/modeling_layoutlm.py +72 -77
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +17 -14
- transformers/models/layoutlmv2/image_processing_layoutlmv2.py +21 -18
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +9 -7
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +50 -64
- transformers/models/layoutlmv2/processing_layoutlmv2.py +44 -14
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +126 -73
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +19 -16
- transformers/models/layoutlmv3/image_processing_layoutlmv3.py +26 -24
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +11 -9
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +56 -82
- transformers/models/layoutlmv3/processing_layoutlmv3.py +46 -14
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +134 -74
- transformers/models/layoutxlm/configuration_layoutxlm.py +17 -14
- transformers/models/layoutxlm/modular_layoutxlm.py +1 -0
- transformers/models/layoutxlm/processing_layoutxlm.py +44 -14
- transformers/models/layoutxlm/tokenization_layoutxlm.py +113 -77
- transformers/models/led/configuration_led.py +12 -8
- transformers/models/led/modeling_led.py +266 -124
- transformers/models/levit/configuration_levit.py +1 -0
- transformers/models/levit/image_processing_levit.py +21 -19
- transformers/models/levit/image_processing_levit_fast.py +5 -4
- transformers/models/levit/modeling_levit.py +19 -38
- transformers/models/lfm2/configuration_lfm2.py +30 -27
- transformers/models/lfm2/modeling_lfm2.py +50 -47
- transformers/models/lfm2/modular_lfm2.py +30 -29
- transformers/models/lfm2_moe/__init__.py +1 -0
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +9 -6
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +53 -61
- transformers/models/lfm2_moe/modular_lfm2_moe.py +37 -13
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +1 -4
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +12 -41
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +66 -84
- transformers/models/lfm2_vl/modular_lfm2_vl.py +56 -70
- transformers/models/lfm2_vl/processing_lfm2_vl.py +76 -96
- transformers/models/lightglue/image_processing_lightglue.py +15 -16
- transformers/models/lightglue/image_processing_lightglue_fast.py +9 -9
- transformers/models/lightglue/modeling_lightglue.py +31 -31
- transformers/models/lightglue/modular_lightglue.py +28 -29
- transformers/models/lilt/configuration_lilt.py +2 -6
- transformers/models/lilt/modeling_lilt.py +70 -76
- transformers/models/llama/configuration_llama.py +31 -26
- transformers/models/llama/modeling_llama.py +39 -36
- transformers/models/llama/tokenization_llama.py +44 -14
- transformers/models/llama4/configuration_llama4.py +30 -27
- transformers/models/llama4/image_processing_llama4_fast.py +14 -12
- transformers/models/llama4/modeling_llama4.py +113 -120
- transformers/models/llama4/processing_llama4.py +57 -33
- transformers/models/llava/configuration_llava.py +1 -10
- transformers/models/llava/image_processing_llava.py +28 -25
- transformers/models/llava/image_processing_llava_fast.py +11 -9
- transformers/models/llava/modeling_llava.py +109 -85
- transformers/models/llava/processing_llava.py +51 -18
- transformers/models/llava_next/configuration_llava_next.py +2 -2
- transformers/models/llava_next/image_processing_llava_next.py +45 -43
- transformers/models/llava_next/image_processing_llava_next_fast.py +13 -11
- transformers/models/llava_next/modeling_llava_next.py +107 -110
- transformers/models/llava_next/processing_llava_next.py +47 -18
- transformers/models/llava_next_video/configuration_llava_next_video.py +7 -4
- transformers/models/llava_next_video/modeling_llava_next_video.py +158 -175
- transformers/models/llava_next_video/modular_llava_next_video.py +150 -155
- transformers/models/llava_next_video/processing_llava_next_video.py +63 -21
- transformers/models/llava_next_video/video_processing_llava_next_video.py +1 -0
- transformers/models/llava_onevision/configuration_llava_onevision.py +7 -4
- transformers/models/llava_onevision/image_processing_llava_onevision.py +42 -40
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +15 -14
- transformers/models/llava_onevision/modeling_llava_onevision.py +169 -177
- transformers/models/llava_onevision/modular_llava_onevision.py +156 -163
- transformers/models/llava_onevision/processing_llava_onevision.py +53 -21
- transformers/models/llava_onevision/video_processing_llava_onevision.py +1 -0
- transformers/models/longcat_flash/__init__.py +1 -0
- transformers/models/longcat_flash/configuration_longcat_flash.py +42 -37
- transformers/models/longcat_flash/modeling_longcat_flash.py +36 -36
- transformers/models/longcat_flash/modular_longcat_flash.py +21 -21
- transformers/models/longformer/configuration_longformer.py +5 -5
- transformers/models/longformer/modeling_longformer.py +101 -105
- transformers/models/longt5/configuration_longt5.py +7 -9
- transformers/models/longt5/modeling_longt5.py +49 -49
- transformers/models/luke/configuration_luke.py +2 -8
- transformers/models/luke/modeling_luke.py +181 -188
- transformers/models/luke/tokenization_luke.py +140 -107
- transformers/models/lxmert/configuration_lxmert.py +1 -16
- transformers/models/lxmert/modeling_lxmert.py +74 -65
- transformers/models/m2m_100/configuration_m2m_100.py +9 -7
- transformers/models/m2m_100/modeling_m2m_100.py +71 -83
- transformers/models/m2m_100/tokenization_m2m_100.py +8 -8
- transformers/models/mamba/configuration_mamba.py +2 -1
- transformers/models/mamba/modeling_mamba.py +66 -58
- transformers/models/mamba2/configuration_mamba2.py +8 -5
- transformers/models/mamba2/modeling_mamba2.py +69 -68
- transformers/models/marian/configuration_marian.py +5 -10
- transformers/models/marian/modeling_marian.py +87 -93
- transformers/models/marian/tokenization_marian.py +6 -6
- transformers/models/markuplm/configuration_markuplm.py +7 -4
- transformers/models/markuplm/feature_extraction_markuplm.py +2 -1
- transformers/models/markuplm/modeling_markuplm.py +70 -69
- transformers/models/markuplm/processing_markuplm.py +38 -31
- transformers/models/markuplm/tokenization_markuplm.py +136 -93
- transformers/models/mask2former/configuration_mask2former.py +8 -5
- transformers/models/mask2former/image_processing_mask2former.py +85 -84
- transformers/models/mask2former/image_processing_mask2former_fast.py +40 -37
- transformers/models/mask2former/modeling_mask2former.py +103 -118
- transformers/models/mask2former/modular_mask2former.py +8 -6
- transformers/models/maskformer/configuration_maskformer.py +9 -6
- transformers/models/maskformer/configuration_maskformer_swin.py +1 -0
- transformers/models/maskformer/image_processing_maskformer.py +85 -84
- transformers/models/maskformer/image_processing_maskformer_fast.py +40 -36
- transformers/models/maskformer/modeling_maskformer.py +65 -79
- transformers/models/maskformer/modeling_maskformer_swin.py +32 -36
- transformers/models/mbart/configuration_mbart.py +4 -9
- transformers/models/mbart/modeling_mbart.py +116 -131
- transformers/models/mbart/tokenization_mbart.py +54 -11
- transformers/models/mbart50/tokenization_mbart50.py +13 -8
- transformers/models/megatron_bert/configuration_megatron_bert.py +3 -13
- transformers/models/megatron_bert/modeling_megatron_bert.py +150 -148
- transformers/models/metaclip_2/configuration_metaclip_2.py +1 -4
- transformers/models/metaclip_2/modeling_metaclip_2.py +84 -91
- transformers/models/metaclip_2/modular_metaclip_2.py +45 -61
- transformers/models/mgp_str/configuration_mgp_str.py +1 -0
- transformers/models/mgp_str/modeling_mgp_str.py +18 -20
- transformers/models/mgp_str/processing_mgp_str.py +20 -3
- transformers/models/mgp_str/tokenization_mgp_str.py +3 -1
- transformers/models/mimi/configuration_mimi.py +40 -42
- transformers/models/mimi/modeling_mimi.py +113 -142
- transformers/models/minimax/__init__.py +1 -0
- transformers/models/minimax/configuration_minimax.py +43 -37
- transformers/models/minimax/modeling_minimax.py +51 -61
- transformers/models/minimax/modular_minimax.py +62 -68
- transformers/models/ministral/configuration_ministral.py +29 -25
- transformers/models/ministral/modeling_ministral.py +38 -36
- transformers/models/ministral/modular_ministral.py +37 -32
- transformers/models/ministral3/configuration_ministral3.py +27 -24
- transformers/models/ministral3/modeling_ministral3.py +37 -36
- transformers/models/ministral3/modular_ministral3.py +5 -4
- transformers/models/mistral/configuration_mistral.py +29 -24
- transformers/models/mistral/modeling_mistral.py +37 -36
- transformers/models/mistral/modular_mistral.py +12 -11
- transformers/models/mistral3/configuration_mistral3.py +1 -4
- transformers/models/mistral3/modeling_mistral3.py +86 -89
- transformers/models/mistral3/modular_mistral3.py +68 -69
- transformers/models/mixtral/configuration_mixtral.py +34 -29
- transformers/models/mixtral/modeling_mixtral.py +45 -50
- transformers/models/mixtral/modular_mixtral.py +31 -32
- transformers/models/mlcd/configuration_mlcd.py +1 -0
- transformers/models/mlcd/modeling_mlcd.py +14 -20
- transformers/models/mlcd/modular_mlcd.py +13 -17
- transformers/models/mllama/configuration_mllama.py +15 -10
- transformers/models/mllama/image_processing_mllama.py +25 -23
- transformers/models/mllama/image_processing_mllama_fast.py +11 -11
- transformers/models/mllama/modeling_mllama.py +94 -105
- transformers/models/mllama/processing_mllama.py +55 -6
- transformers/models/mluke/tokenization_mluke.py +107 -101
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +3 -5
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +140 -155
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +3 -5
- transformers/models/mobilebert/configuration_mobilebert.py +2 -4
- transformers/models/mobilebert/modeling_mobilebert.py +85 -77
- transformers/models/mobilebert/tokenization_mobilebert.py +1 -0
- transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +1 -0
- transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +23 -20
- transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +1 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +16 -15
- transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +1 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +51 -48
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +15 -13
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +22 -24
- transformers/models/mobilevit/configuration_mobilevit.py +1 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +49 -46
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +14 -12
- transformers/models/mobilevit/modeling_mobilevit.py +21 -28
- transformers/models/mobilevitv2/configuration_mobilevitv2.py +1 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +22 -28
- transformers/models/modernbert/configuration_modernbert.py +42 -44
- transformers/models/modernbert/modeling_modernbert.py +133 -145
- transformers/models/modernbert/modular_modernbert.py +170 -186
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +40 -40
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +57 -62
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +86 -94
- transformers/models/moonshine/configuration_moonshine.py +31 -34
- transformers/models/moonshine/modeling_moonshine.py +71 -71
- transformers/models/moonshine/modular_moonshine.py +83 -88
- transformers/models/moshi/configuration_moshi.py +23 -46
- transformers/models/moshi/modeling_moshi.py +187 -157
- transformers/models/mpnet/configuration_mpnet.py +2 -6
- transformers/models/mpnet/modeling_mpnet.py +57 -62
- transformers/models/mpnet/tokenization_mpnet.py +15 -4
- transformers/models/mpt/configuration_mpt.py +9 -5
- transformers/models/mpt/modeling_mpt.py +60 -60
- transformers/models/mra/configuration_mra.py +2 -8
- transformers/models/mra/modeling_mra.py +57 -64
- transformers/models/mt5/configuration_mt5.py +8 -10
- transformers/models/mt5/modeling_mt5.py +95 -87
- transformers/models/musicgen/configuration_musicgen.py +8 -12
- transformers/models/musicgen/modeling_musicgen.py +122 -118
- transformers/models/musicgen/processing_musicgen.py +21 -3
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +8 -15
- transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +9 -8
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +123 -117
- transformers/models/musicgen_melody/processing_musicgen_melody.py +22 -3
- transformers/models/mvp/configuration_mvp.py +5 -8
- transformers/models/mvp/modeling_mvp.py +123 -135
- transformers/models/myt5/tokenization_myt5.py +10 -8
- transformers/models/nanochat/configuration_nanochat.py +8 -5
- transformers/models/nanochat/modeling_nanochat.py +40 -37
- transformers/models/nanochat/modular_nanochat.py +14 -12
- transformers/models/nemotron/configuration_nemotron.py +30 -25
- transformers/models/nemotron/modeling_nemotron.py +57 -56
- transformers/models/nllb/tokenization_nllb.py +28 -12
- transformers/models/nllb_moe/configuration_nllb_moe.py +9 -7
- transformers/models/nllb_moe/modeling_nllb_moe.py +69 -77
- transformers/models/nougat/image_processing_nougat.py +32 -29
- transformers/models/nougat/image_processing_nougat_fast.py +14 -12
- transformers/models/nougat/processing_nougat.py +39 -37
- transformers/models/nougat/tokenization_nougat.py +73 -18
- transformers/models/nystromformer/configuration_nystromformer.py +2 -8
- transformers/models/nystromformer/modeling_nystromformer.py +63 -74
- transformers/models/olmo/configuration_olmo.py +28 -23
- transformers/models/olmo/modeling_olmo.py +39 -36
- transformers/models/olmo/modular_olmo.py +11 -7
- transformers/models/olmo2/configuration_olmo2.py +28 -23
- transformers/models/olmo2/modeling_olmo2.py +41 -37
- transformers/models/olmo2/modular_olmo2.py +32 -29
- transformers/models/olmo3/__init__.py +1 -0
- transformers/models/olmo3/configuration_olmo3.py +30 -26
- transformers/models/olmo3/modeling_olmo3.py +39 -36
- transformers/models/olmo3/modular_olmo3.py +40 -37
- transformers/models/olmoe/configuration_olmoe.py +33 -29
- transformers/models/olmoe/modeling_olmoe.py +46 -52
- transformers/models/olmoe/modular_olmoe.py +15 -16
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +4 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +47 -53
- transformers/models/omdet_turbo/processing_omdet_turbo.py +67 -19
- transformers/models/oneformer/configuration_oneformer.py +8 -5
- transformers/models/oneformer/image_processing_oneformer.py +84 -83
- transformers/models/oneformer/image_processing_oneformer_fast.py +42 -41
- transformers/models/oneformer/modeling_oneformer.py +171 -147
- transformers/models/oneformer/processing_oneformer.py +43 -28
- transformers/models/openai/configuration_openai.py +1 -16
- transformers/models/openai/modeling_openai.py +51 -65
- transformers/models/openai/tokenization_openai.py +47 -8
- transformers/models/opt/configuration_opt.py +7 -6
- transformers/models/opt/modeling_opt.py +76 -78
- transformers/models/ovis2/__init__.py +1 -0
- transformers/models/ovis2/configuration_ovis2.py +1 -0
- transformers/models/ovis2/image_processing_ovis2.py +24 -22
- transformers/models/ovis2/image_processing_ovis2_fast.py +11 -9
- transformers/models/ovis2/modeling_ovis2.py +142 -111
- transformers/models/ovis2/modular_ovis2.py +45 -90
- transformers/models/ovis2/processing_ovis2.py +40 -12
- transformers/models/owlv2/configuration_owlv2.py +2 -4
- transformers/models/owlv2/image_processing_owlv2.py +21 -20
- transformers/models/owlv2/image_processing_owlv2_fast.py +15 -12
- transformers/models/owlv2/modeling_owlv2.py +117 -133
- transformers/models/owlv2/modular_owlv2.py +14 -11
- transformers/models/owlv2/processing_owlv2.py +49 -20
- transformers/models/owlvit/configuration_owlvit.py +2 -4
- transformers/models/owlvit/image_processing_owlvit.py +22 -21
- transformers/models/owlvit/image_processing_owlvit_fast.py +3 -2
- transformers/models/owlvit/modeling_owlvit.py +116 -132
- transformers/models/owlvit/processing_owlvit.py +48 -20
- transformers/models/paligemma/configuration_paligemma.py +1 -4
- transformers/models/paligemma/modeling_paligemma.py +93 -103
- transformers/models/paligemma/processing_paligemma.py +66 -13
- transformers/models/parakeet/configuration_parakeet.py +14 -7
- transformers/models/parakeet/feature_extraction_parakeet.py +12 -10
- transformers/models/parakeet/modeling_parakeet.py +28 -32
- transformers/models/parakeet/modular_parakeet.py +20 -23
- transformers/models/parakeet/processing_parakeet.py +5 -13
- transformers/models/parakeet/{tokenization_parakeet.py → tokenization_parakeet_fast.py} +7 -5
- transformers/models/patchtsmixer/configuration_patchtsmixer.py +8 -5
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +62 -70
- transformers/models/patchtst/configuration_patchtst.py +9 -6
- transformers/models/patchtst/modeling_patchtst.py +80 -97
- transformers/models/pegasus/configuration_pegasus.py +5 -8
- transformers/models/pegasus/modeling_pegasus.py +66 -72
- transformers/models/pegasus/tokenization_pegasus.py +45 -15
- transformers/models/pegasus_x/configuration_pegasus_x.py +4 -5
- transformers/models/pegasus_x/modeling_pegasus_x.py +52 -55
- transformers/models/perceiver/configuration_perceiver.py +1 -0
- transformers/models/perceiver/image_processing_perceiver.py +25 -22
- transformers/models/perceiver/image_processing_perceiver_fast.py +9 -7
- transformers/models/perceiver/modeling_perceiver.py +146 -165
- transformers/models/perceiver/tokenization_perceiver.py +6 -3
- transformers/models/perception_lm/configuration_perception_lm.py +1 -0
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +10 -8
- transformers/models/perception_lm/modeling_perception_lm.py +70 -71
- transformers/models/perception_lm/modular_perception_lm.py +61 -65
- transformers/models/perception_lm/processing_perception_lm.py +47 -13
- transformers/models/perception_lm/video_processing_perception_lm.py +1 -0
- transformers/models/persimmon/configuration_persimmon.py +28 -23
- transformers/models/persimmon/modeling_persimmon.py +45 -43
- transformers/models/phi/configuration_phi.py +28 -23
- transformers/models/phi/modeling_phi.py +43 -40
- transformers/models/phi/modular_phi.py +24 -23
- transformers/models/phi3/configuration_phi3.py +33 -28
- transformers/models/phi3/modeling_phi3.py +38 -36
- transformers/models/phi3/modular_phi3.py +17 -13
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +33 -30
- transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +9 -7
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +11 -11
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +78 -95
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +80 -98
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +44 -7
- transformers/models/phimoe/configuration_phimoe.py +36 -31
- transformers/models/phimoe/modeling_phimoe.py +45 -50
- transformers/models/phimoe/modular_phimoe.py +4 -3
- transformers/models/phobert/tokenization_phobert.py +6 -4
- transformers/models/pix2struct/configuration_pix2struct.py +10 -12
- transformers/models/pix2struct/image_processing_pix2struct.py +19 -15
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +15 -12
- transformers/models/pix2struct/modeling_pix2struct.py +52 -58
- transformers/models/pix2struct/processing_pix2struct.py +30 -5
- transformers/models/pixtral/configuration_pixtral.py +14 -11
- transformers/models/pixtral/image_processing_pixtral.py +28 -26
- transformers/models/pixtral/image_processing_pixtral_fast.py +11 -10
- transformers/models/pixtral/modeling_pixtral.py +34 -28
- transformers/models/pixtral/processing_pixtral.py +53 -21
- transformers/models/plbart/configuration_plbart.py +5 -8
- transformers/models/plbart/modeling_plbart.py +106 -119
- transformers/models/plbart/modular_plbart.py +33 -39
- transformers/models/plbart/tokenization_plbart.py +7 -4
- transformers/models/poolformer/configuration_poolformer.py +1 -0
- transformers/models/poolformer/image_processing_poolformer.py +24 -21
- transformers/models/poolformer/image_processing_poolformer_fast.py +15 -13
- transformers/models/poolformer/modeling_poolformer.py +13 -23
- transformers/models/pop2piano/configuration_pop2piano.py +8 -7
- transformers/models/pop2piano/feature_extraction_pop2piano.py +9 -6
- transformers/models/pop2piano/modeling_pop2piano.py +24 -26
- transformers/models/pop2piano/processing_pop2piano.py +33 -25
- transformers/models/pop2piano/tokenization_pop2piano.py +23 -15
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +3 -3
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +28 -28
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +21 -20
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +13 -16
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +13 -16
- transformers/models/prophetnet/configuration_prophetnet.py +38 -37
- transformers/models/prophetnet/modeling_prophetnet.py +131 -114
- transformers/models/prophetnet/tokenization_prophetnet.py +16 -14
- transformers/models/pvt/configuration_pvt.py +1 -0
- transformers/models/pvt/image_processing_pvt.py +27 -24
- transformers/models/pvt/image_processing_pvt_fast.py +2 -1
- transformers/models/pvt/modeling_pvt.py +21 -21
- transformers/models/pvt_v2/configuration_pvt_v2.py +4 -2
- transformers/models/pvt_v2/modeling_pvt_v2.py +25 -28
- transformers/models/qwen2/configuration_qwen2.py +25 -32
- transformers/models/qwen2/modeling_qwen2.py +38 -36
- transformers/models/qwen2/modular_qwen2.py +12 -11
- transformers/models/qwen2/tokenization_qwen2.py +23 -12
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +26 -32
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +277 -340
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +211 -278
- transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +49 -41
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +35 -29
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +148 -203
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +118 -93
- transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +43 -7
- transformers/models/qwen2_audio/configuration_qwen2_audio.py +1 -0
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +40 -40
- transformers/models/qwen2_audio/processing_qwen2_audio.py +42 -13
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +35 -42
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +46 -51
- transformers/models/qwen2_moe/modular_qwen2_moe.py +10 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +34 -29
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +42 -41
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +15 -12
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +153 -199
- transformers/models/qwen2_vl/processing_qwen2_vl.py +44 -7
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +18 -38
- transformers/models/qwen3/configuration_qwen3.py +27 -34
- transformers/models/qwen3/modeling_qwen3.py +39 -36
- transformers/models/qwen3/modular_qwen3.py +6 -4
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +32 -39
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +46 -51
- transformers/models/qwen3_moe/modular_qwen3_moe.py +13 -10
- transformers/models/qwen3_next/configuration_qwen3_next.py +35 -45
- transformers/models/qwen3_next/modeling_qwen3_next.py +51 -47
- transformers/models/qwen3_next/modular_qwen3_next.py +35 -34
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +101 -135
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +252 -355
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +196 -250
- transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +48 -40
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +29 -27
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +155 -233
- transformers/models/qwen3_vl/modular_qwen3_vl.py +179 -206
- transformers/models/qwen3_vl/processing_qwen3_vl.py +42 -6
- transformers/models/qwen3_vl/video_processing_qwen3_vl.py +12 -10
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +30 -23
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +303 -358
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +124 -87
- transformers/models/rag/configuration_rag.py +15 -6
- transformers/models/rag/modeling_rag.py +130 -127
- transformers/models/rag/retrieval_rag.py +5 -3
- transformers/models/rag/tokenization_rag.py +50 -0
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +30 -29
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +42 -53
- transformers/models/reformer/configuration_reformer.py +8 -7
- transformers/models/reformer/modeling_reformer.py +69 -80
- transformers/models/reformer/tokenization_reformer.py +31 -11
- transformers/models/regnet/configuration_regnet.py +1 -0
- transformers/models/regnet/modeling_regnet.py +8 -15
- transformers/models/rembert/configuration_rembert.py +2 -8
- transformers/models/rembert/modeling_rembert.py +111 -121
- transformers/models/rembert/tokenization_rembert.py +12 -2
- transformers/models/resnet/configuration_resnet.py +1 -0
- transformers/models/resnet/modeling_resnet.py +13 -27
- transformers/models/roberta/configuration_roberta.py +3 -11
- transformers/models/roberta/modeling_roberta.py +93 -94
- transformers/models/roberta/modular_roberta.py +58 -58
- transformers/models/roberta/tokenization_roberta.py +29 -17
- transformers/models/roberta/tokenization_roberta_old.py +4 -2
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +3 -11
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +93 -94
- transformers/models/roc_bert/configuration_roc_bert.py +2 -8
- transformers/models/roc_bert/modeling_roc_bert.py +121 -122
- transformers/models/roc_bert/tokenization_roc_bert.py +94 -88
- transformers/models/roformer/configuration_roformer.py +3 -13
- transformers/models/roformer/modeling_roformer.py +81 -85
- transformers/models/roformer/tokenization_roformer.py +412 -74
- transformers/models/roformer/tokenization_roformer_fast.py +160 -0
- transformers/models/roformer/tokenization_utils.py +1 -0
- transformers/models/rt_detr/configuration_rt_detr.py +2 -1
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +1 -0
- transformers/models/rt_detr/image_processing_rt_detr.py +55 -54
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +26 -26
- transformers/models/rt_detr/modeling_rt_detr.py +90 -99
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +6 -13
- transformers/models/rt_detr/modular_rt_detr.py +16 -16
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +4 -6
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +90 -101
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +12 -19
- transformers/models/rwkv/configuration_rwkv.py +4 -2
- transformers/models/rwkv/modeling_rwkv.py +32 -31
- transformers/models/sam/configuration_sam.py +1 -3
- transformers/models/sam/image_processing_sam.py +60 -59
- transformers/models/sam/image_processing_sam_fast.py +27 -25
- transformers/models/sam/modeling_sam.py +41 -47
- transformers/models/sam/processing_sam.py +27 -39
- transformers/models/sam2/configuration_sam2.py +3 -2
- transformers/models/sam2/image_processing_sam2_fast.py +15 -14
- transformers/models/sam2/modeling_sam2.py +90 -96
- transformers/models/sam2/modular_sam2.py +91 -86
- transformers/models/sam2/processing_sam2.py +47 -31
- transformers/models/sam2_video/configuration_sam2_video.py +1 -0
- transformers/models/sam2_video/modeling_sam2_video.py +144 -151
- transformers/models/sam2_video/modular_sam2_video.py +104 -101
- transformers/models/sam2_video/processing_sam2_video.py +66 -49
- transformers/models/sam2_video/video_processing_sam2_video.py +4 -1
- transformers/models/sam3/configuration_sam3.py +2 -21
- transformers/models/sam3/image_processing_sam3_fast.py +20 -17
- transformers/models/sam3/modeling_sam3.py +170 -184
- transformers/models/sam3/modular_sam3.py +8 -3
- transformers/models/sam3/processing_sam3.py +52 -37
- transformers/models/sam3_tracker/__init__.py +1 -0
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +3 -1
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +77 -82
- transformers/models/sam3_tracker/modular_sam3_tracker.py +3 -8
- transformers/models/sam3_tracker/processing_sam3_tracker.py +48 -31
- transformers/models/sam3_tracker_video/__init__.py +1 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +1 -25
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +122 -135
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +26 -35
- transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +66 -50
- transformers/models/sam3_video/configuration_sam3_video.py +1 -14
- transformers/models/sam3_video/modeling_sam3_video.py +34 -33
- transformers/models/sam3_video/processing_sam3_video.py +46 -26
- transformers/models/sam_hq/__init__.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -3
- transformers/models/sam_hq/modeling_sam_hq.py +69 -74
- transformers/models/sam_hq/modular_sam_hq.py +25 -23
- transformers/models/sam_hq/{processing_sam_hq.py → processing_samhq.py} +29 -41
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +10 -8
- transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py +11 -8
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +194 -212
- transformers/models/seamless_m4t/processing_seamless_m4t.py +39 -18
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +77 -40
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +10 -8
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +196 -204
- transformers/models/seed_oss/configuration_seed_oss.py +32 -28
- transformers/models/seed_oss/modeling_seed_oss.py +35 -33
- transformers/models/seed_oss/modular_seed_oss.py +4 -3
- transformers/models/segformer/configuration_segformer.py +10 -0
- transformers/models/segformer/image_processing_segformer.py +42 -39
- transformers/models/segformer/image_processing_segformer_fast.py +12 -10
- transformers/models/segformer/modeling_segformer.py +31 -34
- transformers/models/segformer/modular_segformer.py +10 -8
- transformers/models/seggpt/configuration_seggpt.py +1 -0
- transformers/models/seggpt/image_processing_seggpt.py +41 -38
- transformers/models/seggpt/modeling_seggpt.py +38 -50
- transformers/models/sew/configuration_sew.py +2 -4
- transformers/models/sew/modeling_sew.py +36 -38
- transformers/models/sew/modular_sew.py +13 -13
- transformers/models/sew_d/configuration_sew_d.py +2 -4
- transformers/models/sew_d/modeling_sew_d.py +30 -31
- transformers/models/shieldgemma2/configuration_shieldgemma2.py +1 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +17 -16
- transformers/models/shieldgemma2/processing_shieldgemma2.py +5 -3
- transformers/models/siglip/configuration_siglip.py +2 -4
- transformers/models/siglip/image_processing_siglip.py +20 -17
- transformers/models/siglip/image_processing_siglip_fast.py +1 -0
- transformers/models/siglip/modeling_siglip.py +75 -84
- transformers/models/siglip/processing_siglip.py +14 -2
- transformers/models/siglip/tokenization_siglip.py +7 -6
- transformers/models/siglip2/configuration_siglip2.py +2 -5
- transformers/models/siglip2/image_processing_siglip2.py +16 -15
- transformers/models/siglip2/image_processing_siglip2_fast.py +7 -6
- transformers/models/siglip2/modeling_siglip2.py +129 -143
- transformers/models/siglip2/modular_siglip2.py +46 -47
- transformers/models/siglip2/processing_siglip2.py +14 -2
- transformers/models/smollm3/configuration_smollm3.py +32 -29
- transformers/models/smollm3/modeling_smollm3.py +39 -36
- transformers/models/smollm3/modular_smollm3.py +35 -33
- transformers/models/smolvlm/configuration_smolvlm.py +4 -2
- transformers/models/smolvlm/image_processing_smolvlm.py +43 -42
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +15 -41
- transformers/models/smolvlm/modeling_smolvlm.py +94 -126
- transformers/models/smolvlm/modular_smolvlm.py +39 -50
- transformers/models/smolvlm/processing_smolvlm.py +83 -15
- transformers/models/smolvlm/video_processing_smolvlm.py +18 -16
- transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +1 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +27 -26
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -9
- transformers/models/speech_to_text/feature_extraction_speech_to_text.py +13 -10
- transformers/models/speech_to_text/modeling_speech_to_text.py +54 -66
- transformers/models/speech_to_text/processing_speech_to_text.py +30 -4
- transformers/models/speech_to_text/tokenization_speech_to_text.py +6 -5
- transformers/models/speecht5/configuration_speecht5.py +9 -7
- transformers/models/speecht5/feature_extraction_speecht5.py +37 -16
- transformers/models/speecht5/modeling_speecht5.py +175 -213
- transformers/models/speecht5/number_normalizer.py +1 -0
- transformers/models/speecht5/processing_speecht5.py +37 -3
- transformers/models/speecht5/tokenization_speecht5.py +5 -4
- transformers/models/splinter/configuration_splinter.py +7 -6
- transformers/models/splinter/modeling_splinter.py +59 -71
- transformers/models/splinter/tokenization_splinter.py +30 -9
- transformers/models/squeezebert/configuration_squeezebert.py +2 -14
- transformers/models/squeezebert/modeling_squeezebert.py +62 -68
- transformers/models/squeezebert/tokenization_squeezebert.py +1 -0
- transformers/models/stablelm/configuration_stablelm.py +29 -24
- transformers/models/stablelm/modeling_stablelm.py +45 -44
- transformers/models/starcoder2/configuration_starcoder2.py +27 -30
- transformers/models/starcoder2/modeling_starcoder2.py +41 -39
- transformers/models/starcoder2/modular_starcoder2.py +16 -14
- transformers/models/superglue/configuration_superglue.py +3 -7
- transformers/models/superglue/image_processing_superglue.py +15 -15
- transformers/models/superglue/image_processing_superglue_fast.py +10 -9
- transformers/models/superglue/modeling_superglue.py +37 -42
- transformers/models/superpoint/image_processing_superpoint.py +15 -15
- transformers/models/superpoint/image_processing_superpoint_fast.py +11 -8
- transformers/models/superpoint/modeling_superpoint.py +16 -18
- transformers/models/swiftformer/configuration_swiftformer.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +14 -18
- transformers/models/swin/configuration_swin.py +1 -0
- transformers/models/swin/modeling_swin.py +86 -86
- transformers/models/swin2sr/configuration_swin2sr.py +1 -0
- transformers/models/swin2sr/image_processing_swin2sr.py +13 -10
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +8 -4
- transformers/models/swin2sr/modeling_swin2sr.py +63 -81
- transformers/models/swinv2/configuration_swinv2.py +1 -0
- transformers/models/swinv2/modeling_swinv2.py +104 -108
- transformers/models/switch_transformers/configuration_switch_transformers.py +7 -11
- transformers/models/switch_transformers/modeling_switch_transformers.py +44 -37
- transformers/models/switch_transformers/modular_switch_transformers.py +41 -34
- transformers/models/t5/configuration_t5.py +8 -14
- transformers/models/t5/modeling_t5.py +92 -88
- transformers/models/t5/tokenization_t5.py +9 -3
- transformers/models/t5gemma/configuration_t5gemma.py +41 -43
- transformers/models/t5gemma/modeling_t5gemma.py +107 -104
- transformers/models/t5gemma/modular_t5gemma.py +120 -124
- transformers/models/t5gemma2/configuration_t5gemma2.py +120 -80
- transformers/models/t5gemma2/modeling_t5gemma2.py +125 -141
- transformers/models/t5gemma2/modular_t5gemma2.py +104 -393
- transformers/models/table_transformer/configuration_table_transformer.py +2 -1
- transformers/models/table_transformer/modeling_table_transformer.py +49 -51
- transformers/models/tapas/configuration_tapas.py +2 -12
- transformers/models/tapas/modeling_tapas.py +67 -68
- transformers/models/tapas/tokenization_tapas.py +153 -115
- transformers/models/textnet/configuration_textnet.py +1 -0
- transformers/models/textnet/image_processing_textnet.py +25 -22
- transformers/models/textnet/image_processing_textnet_fast.py +10 -8
- transformers/models/textnet/modeling_textnet.py +16 -28
- transformers/models/time_series_transformer/configuration_time_series_transformer.py +8 -5
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +81 -83
- transformers/models/timesfm/configuration_timesfm.py +1 -0
- transformers/models/timesfm/modeling_timesfm.py +22 -33
- transformers/models/timesfm/modular_timesfm.py +21 -32
- transformers/models/timesformer/configuration_timesformer.py +1 -0
- transformers/models/timesformer/modeling_timesformer.py +16 -15
- transformers/models/timm_backbone/configuration_timm_backbone.py +1 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +15 -17
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -5
- transformers/models/timm_wrapper/image_processing_timm_wrapper.py +5 -4
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +29 -34
- transformers/models/trocr/configuration_trocr.py +8 -11
- transformers/models/trocr/modeling_trocr.py +44 -45
- transformers/models/trocr/processing_trocr.py +25 -5
- transformers/models/tvp/configuration_tvp.py +2 -5
- transformers/models/tvp/image_processing_tvp.py +52 -50
- transformers/models/tvp/image_processing_tvp_fast.py +15 -15
- transformers/models/tvp/modeling_tvp.py +27 -27
- transformers/models/tvp/processing_tvp.py +14 -2
- transformers/models/udop/configuration_udop.py +7 -16
- transformers/models/udop/modeling_udop.py +73 -71
- transformers/models/udop/processing_udop.py +26 -7
- transformers/models/udop/tokenization_udop.py +105 -84
- transformers/models/umt5/configuration_umt5.py +7 -8
- transformers/models/umt5/modeling_umt5.py +90 -94
- transformers/models/unispeech/configuration_unispeech.py +2 -4
- transformers/models/unispeech/modeling_unispeech.py +49 -51
- transformers/models/unispeech/modular_unispeech.py +22 -22
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +2 -4
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +65 -69
- transformers/models/unispeech_sat/modular_unispeech_sat.py +23 -23
- transformers/models/univnet/feature_extraction_univnet.py +14 -14
- transformers/models/univnet/modeling_univnet.py +8 -8
- transformers/models/upernet/configuration_upernet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +13 -11
- transformers/models/vaultgemma/__init__.py +1 -0
- transformers/models/vaultgemma/configuration_vaultgemma.py +33 -29
- transformers/models/vaultgemma/modeling_vaultgemma.py +41 -39
- transformers/models/vaultgemma/modular_vaultgemma.py +31 -29
- transformers/models/video_llama_3/configuration_video_llama_3.py +0 -4
- transformers/models/video_llama_3/image_processing_video_llama_3.py +42 -43
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +14 -12
- transformers/models/video_llama_3/modeling_video_llama_3.py +109 -157
- transformers/models/video_llama_3/modular_video_llama_3.py +146 -155
- transformers/models/video_llama_3/processing_video_llama_3.py +39 -5
- transformers/models/video_llama_3/video_processing_video_llama_3.py +23 -42
- transformers/models/video_llava/configuration_video_llava.py +1 -4
- transformers/models/video_llava/image_processing_video_llava.py +38 -35
- transformers/models/video_llava/modeling_video_llava.py +146 -146
- transformers/models/video_llava/processing_video_llava.py +78 -38
- transformers/models/video_llava/video_processing_video_llava.py +1 -0
- transformers/models/videomae/configuration_videomae.py +1 -0
- transformers/models/videomae/image_processing_videomae.py +34 -31
- transformers/models/videomae/modeling_videomae.py +17 -14
- transformers/models/videomae/video_processing_videomae.py +1 -0
- transformers/models/vilt/configuration_vilt.py +4 -6
- transformers/models/vilt/image_processing_vilt.py +30 -29
- transformers/models/vilt/image_processing_vilt_fast.py +16 -15
- transformers/models/vilt/modeling_vilt.py +90 -116
- transformers/models/vilt/processing_vilt.py +14 -2
- transformers/models/vipllava/configuration_vipllava.py +1 -4
- transformers/models/vipllava/modeling_vipllava.py +70 -99
- transformers/models/vipllava/modular_vipllava.py +54 -78
- transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +1 -0
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +27 -28
- transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +1 -0
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +41 -46
- transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +16 -2
- transformers/models/visual_bert/configuration_visual_bert.py +2 -6
- transformers/models/visual_bert/modeling_visual_bert.py +92 -98
- transformers/models/vit/configuration_vit.py +1 -0
- transformers/models/vit/image_processing_vit.py +22 -19
- transformers/models/vit/image_processing_vit_fast.py +1 -0
- transformers/models/vit/modeling_vit.py +17 -17
- transformers/models/vit_mae/configuration_vit_mae.py +1 -0
- transformers/models/vit_mae/modeling_vit_mae.py +27 -29
- transformers/models/vit_msn/configuration_vit_msn.py +1 -0
- transformers/models/vit_msn/modeling_vit_msn.py +16 -18
- transformers/models/vitdet/configuration_vitdet.py +1 -0
- transformers/models/vitdet/modeling_vitdet.py +14 -14
- transformers/models/vitmatte/configuration_vitmatte.py +5 -2
- transformers/models/vitmatte/image_processing_vitmatte.py +18 -15
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +18 -16
- transformers/models/vitmatte/modeling_vitmatte.py +11 -14
- transformers/models/vitpose/configuration_vitpose.py +7 -4
- transformers/models/vitpose/image_processing_vitpose.py +25 -24
- transformers/models/vitpose/image_processing_vitpose_fast.py +11 -9
- transformers/models/vitpose/modeling_vitpose.py +14 -14
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +1 -0
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +10 -8
- transformers/models/vits/configuration_vits.py +1 -4
- transformers/models/vits/modeling_vits.py +42 -44
- transformers/models/vits/tokenization_vits.py +4 -3
- transformers/models/vivit/configuration_vivit.py +1 -0
- transformers/models/vivit/image_processing_vivit.py +39 -36
- transformers/models/vivit/modeling_vivit.py +8 -6
- transformers/models/vjepa2/__init__.py +1 -0
- transformers/models/vjepa2/configuration_vjepa2.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +32 -31
- transformers/models/vjepa2/video_processing_vjepa2.py +1 -0
- transformers/models/voxtral/__init__.py +1 -0
- transformers/models/voxtral/configuration_voxtral.py +2 -0
- transformers/models/voxtral/modeling_voxtral.py +47 -40
- transformers/models/voxtral/modular_voxtral.py +40 -37
- transformers/models/voxtral/processing_voxtral.py +48 -25
- transformers/models/wav2vec2/configuration_wav2vec2.py +2 -4
- transformers/models/wav2vec2/feature_extraction_wav2vec2.py +10 -7
- transformers/models/wav2vec2/modeling_wav2vec2.py +121 -73
- transformers/models/wav2vec2/processing_wav2vec2.py +35 -6
- transformers/models/wav2vec2/tokenization_wav2vec2.py +332 -20
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +2 -4
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +62 -70
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +48 -57
- transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +35 -6
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +2 -4
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +77 -90
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +30 -37
- transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +17 -16
- transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +55 -36
- transformers/models/wavlm/configuration_wavlm.py +2 -4
- transformers/models/wavlm/modeling_wavlm.py +48 -50
- transformers/models/wavlm/modular_wavlm.py +5 -4
- transformers/models/whisper/configuration_whisper.py +5 -6
- transformers/models/whisper/english_normalizer.py +4 -3
- transformers/models/whisper/feature_extraction_whisper.py +24 -9
- transformers/models/whisper/generation_whisper.py +48 -26
- transformers/models/whisper/modeling_whisper.py +73 -79
- transformers/models/whisper/processing_whisper.py +20 -3
- transformers/models/whisper/tokenization_whisper.py +43 -11
- transformers/models/x_clip/configuration_x_clip.py +2 -4
- transformers/models/x_clip/modeling_x_clip.py +93 -96
- transformers/models/x_clip/processing_x_clip.py +14 -2
- transformers/models/xcodec/configuration_xcodec.py +6 -4
- transformers/models/xcodec/modeling_xcodec.py +17 -20
- transformers/models/xglm/configuration_xglm.py +8 -9
- transformers/models/xglm/modeling_xglm.py +55 -60
- transformers/models/xglm/tokenization_xglm.py +11 -3
- transformers/models/xlm/configuration_xlm.py +8 -10
- transformers/models/xlm/modeling_xlm.py +144 -144
- transformers/models/xlm/tokenization_xlm.py +5 -3
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +3 -11
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +194 -195
- transformers/models/xlm_roberta/modular_xlm_roberta.py +53 -50
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +18 -8
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +2 -10
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +93 -94
- transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +70 -67
- transformers/models/xlnet/configuration_xlnet.py +12 -3
- transformers/models/xlnet/modeling_xlnet.py +163 -152
- transformers/models/xlnet/tokenization_xlnet.py +9 -2
- transformers/models/xlstm/configuration_xlstm.py +12 -8
- transformers/models/xlstm/modeling_xlstm.py +65 -62
- transformers/models/xmod/configuration_xmod.py +3 -11
- transformers/models/xmod/modeling_xmod.py +110 -108
- transformers/models/yolos/configuration_yolos.py +1 -0
- transformers/models/yolos/image_processing_yolos.py +62 -60
- transformers/models/yolos/image_processing_yolos_fast.py +45 -42
- transformers/models/yolos/modeling_yolos.py +16 -16
- transformers/models/yolos/modular_yolos.py +19 -17
- transformers/models/yoso/configuration_yoso.py +2 -8
- transformers/models/yoso/modeling_yoso.py +63 -70
- transformers/models/zamba/configuration_zamba.py +8 -5
- transformers/models/zamba/modeling_zamba.py +78 -81
- transformers/models/zamba2/configuration_zamba2.py +50 -44
- transformers/models/zamba2/modeling_zamba2.py +97 -97
- transformers/models/zamba2/modular_zamba2.py +48 -46
- transformers/models/zoedepth/configuration_zoedepth.py +2 -1
- transformers/models/zoedepth/image_processing_zoedepth.py +29 -28
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +24 -21
- transformers/models/zoedepth/modeling_zoedepth.py +18 -26
- transformers/pipelines/__init__.py +114 -57
- transformers/pipelines/any_to_any.py +22 -14
- transformers/pipelines/audio_utils.py +2 -1
- transformers/pipelines/automatic_speech_recognition.py +12 -20
- transformers/pipelines/base.py +27 -15
- transformers/{models/pe_audio/processing_pe_audio.py → pipelines/deprecated/__init__.py} +3 -10
- transformers/pipelines/deprecated/text2text_generation.py +408 -0
- transformers/pipelines/document_question_answering.py +2 -4
- transformers/pipelines/image_text_to_text.py +1 -0
- transformers/pipelines/image_to_text.py +229 -0
- transformers/pipelines/question_answering.py +44 -5
- transformers/pipelines/text_classification.py +14 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/pipelines/token_classification.py +22 -1
- transformers/pipelines/video_classification.py +9 -1
- transformers/pipelines/zero_shot_audio_classification.py +1 -0
- transformers/pipelines/zero_shot_classification.py +6 -0
- transformers/pipelines/zero_shot_image_classification.py +7 -0
- transformers/processing_utils.py +145 -230
- transformers/quantizers/auto.py +4 -2
- transformers/quantizers/base.py +173 -53
- transformers/quantizers/quantizer_aqlm.py +23 -2
- transformers/quantizers/quantizer_auto_round.py +12 -2
- transformers/quantizers/quantizer_awq.py +89 -20
- transformers/quantizers/quantizer_bitnet.py +14 -4
- transformers/quantizers/quantizer_bnb_4bit.py +155 -18
- transformers/quantizers/quantizer_bnb_8bit.py +110 -24
- transformers/quantizers/quantizer_compressed_tensors.py +9 -2
- transformers/quantizers/quantizer_eetq.py +74 -16
- transformers/quantizers/quantizer_fbgemm_fp8.py +138 -38
- transformers/quantizers/quantizer_finegrained_fp8.py +113 -26
- transformers/quantizers/quantizer_fp_quant.py +82 -52
- transformers/quantizers/quantizer_gptq.py +28 -8
- transformers/quantizers/quantizer_higgs.py +60 -42
- transformers/quantizers/quantizer_hqq.py +153 -144
- transformers/quantizers/quantizer_mxfp4.py +194 -14
- transformers/quantizers/quantizer_quanto.py +79 -35
- transformers/quantizers/quantizer_quark.py +18 -36
- transformers/quantizers/quantizer_spqr.py +12 -4
- transformers/quantizers/quantizer_torchao.py +325 -50
- transformers/quantizers/quantizer_vptq.py +27 -4
- transformers/quantizers/quantizers_utils.py +0 -20
- transformers/safetensors_conversion.py +3 -9
- transformers/testing_utils.py +82 -326
- transformers/tokenization_mistral_common.py +903 -568
- transformers/tokenization_utils_base.py +340 -220
- transformers/tokenization_utils_sentencepiece.py +6 -5
- transformers/tokenization_utils_tokenizers.py +113 -226
- transformers/trainer.py +53 -60
- transformers/trainer_callback.py +0 -8
- transformers/trainer_seq2seq.py +1 -5
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +41 -77
- transformers/utils/__init__.py +4 -8
- transformers/utils/attention_visualizer.py +5 -5
- transformers/utils/auto_docstring.py +37 -599
- transformers/utils/doc.py +36 -4
- transformers/utils/dummy_pt_objects.py +42 -0
- transformers/utils/generic.py +28 -111
- transformers/utils/hub.py +15 -5
- transformers/utils/import_utils.py +32 -165
- transformers/utils/kernel_config.py +19 -74
- transformers/utils/loading_report.py +15 -25
- transformers/utils/quantization_config.py +241 -72
- transformers/video_processing_utils.py +39 -41
- transformers/video_utils.py +22 -18
- {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/METADATA +236 -284
- transformers-5.0.0rc0.dist-info/RECORD +1987 -0
- {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/WHEEL +1 -1
- transformers/integrations/moe.py +0 -360
- transformers/integrations/quark.py +0 -53
- transformers/loss/loss_lw_detr.py +0 -356
- transformers/models/ernie4_5_vl_moe/__init__.py +0 -31
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +0 -340
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +0 -455
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +0 -231
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +0 -1936
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +0 -1925
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +0 -249
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +0 -593
- transformers/models/fast_vlm/__init__.py +0 -27
- transformers/models/fast_vlm/configuration_fast_vlm.py +0 -137
- transformers/models/fast_vlm/modeling_fast_vlm.py +0 -432
- transformers/models/fast_vlm/modular_fast_vlm.py +0 -373
- transformers/models/glm4_moe_lite/__init__.py +0 -28
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +0 -233
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +0 -740
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +0 -302
- transformers/models/glm_image/__init__.py +0 -31
- transformers/models/glm_image/configuration_glm_image.py +0 -351
- transformers/models/glm_image/image_processing_glm_image.py +0 -503
- transformers/models/glm_image/image_processing_glm_image_fast.py +0 -294
- transformers/models/glm_image/modeling_glm_image.py +0 -1642
- transformers/models/glm_image/modular_glm_image.py +0 -1531
- transformers/models/glm_image/processing_glm_image.py +0 -217
- transformers/models/glmasr/__init__.py +0 -29
- transformers/models/glmasr/configuration_glmasr.py +0 -196
- transformers/models/glmasr/modeling_glmasr.py +0 -517
- transformers/models/glmasr/modular_glmasr.py +0 -443
- transformers/models/glmasr/processing_glmasr.py +0 -331
- transformers/models/jais2/__init__.py +0 -27
- transformers/models/jais2/configuration_jais2.py +0 -148
- transformers/models/jais2/modeling_jais2.py +0 -484
- transformers/models/jais2/modular_jais2.py +0 -194
- transformers/models/lasr/__init__.py +0 -29
- transformers/models/lasr/configuration_lasr.py +0 -244
- transformers/models/lasr/feature_extraction_lasr.py +0 -275
- transformers/models/lasr/modeling_lasr.py +0 -727
- transformers/models/lasr/modular_lasr.py +0 -574
- transformers/models/lasr/processing_lasr.py +0 -100
- transformers/models/lasr/tokenization_lasr.py +0 -184
- transformers/models/lighton_ocr/__init__.py +0 -28
- transformers/models/lighton_ocr/configuration_lighton_ocr.py +0 -128
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +0 -463
- transformers/models/lighton_ocr/modular_lighton_ocr.py +0 -404
- transformers/models/lighton_ocr/processing_lighton_ocr.py +0 -229
- transformers/models/lw_detr/__init__.py +0 -27
- transformers/models/lw_detr/configuration_lw_detr.py +0 -374
- transformers/models/lw_detr/modeling_lw_detr.py +0 -1702
- transformers/models/lw_detr/modular_lw_detr.py +0 -1615
- transformers/models/minimax_m2/__init__.py +0 -28
- transformers/models/minimax_m2/configuration_minimax_m2.py +0 -188
- transformers/models/minimax_m2/modeling_minimax_m2.py +0 -704
- transformers/models/minimax_m2/modular_minimax_m2.py +0 -346
- transformers/models/paddleocr_vl/__init__.py +0 -31
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +0 -335
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +0 -503
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +0 -209
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +0 -1683
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +0 -1380
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +0 -133
- transformers/models/pe_audio/__init__.py +0 -29
- transformers/models/pe_audio/configuration_pe_audio.py +0 -204
- transformers/models/pe_audio/feature_extraction_pe_audio.py +0 -160
- transformers/models/pe_audio/modeling_pe_audio.py +0 -819
- transformers/models/pe_audio/modular_pe_audio.py +0 -298
- transformers/models/pe_audio_video/__init__.py +0 -28
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +0 -223
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +0 -971
- transformers/models/pe_audio_video/modular_pe_audio_video.py +0 -763
- transformers/models/pe_video/__init__.py +0 -29
- transformers/models/pe_video/configuration_pe_video.py +0 -209
- transformers/models/pe_video/modeling_pe_video.py +0 -647
- transformers/models/pe_video/modular_pe_video.py +0 -231
- transformers/models/pe_video/processing_pe_video.py +0 -10
- transformers/models/pe_video/video_processing_pe_video.py +0 -64
- transformers/models/pixio/__init__.py +0 -29
- transformers/models/pixio/configuration_pixio.py +0 -150
- transformers/models/pixio/modeling_pixio.py +0 -507
- transformers/models/pixio/modular_pixio.py +0 -403
- transformers/models/solar_open/__init__.py +0 -27
- transformers/models/solar_open/configuration_solar_open.py +0 -184
- transformers/models/solar_open/modeling_solar_open.py +0 -642
- transformers/models/solar_open/modular_solar_open.py +0 -224
- transformers/trainer_jit_checkpoint.py +0 -125
- transformers-5.0.0.dist-info/RECORD +0 -2068
- {transformers-5.0.0.dist-info/licenses → transformers-5.0.0rc0.dist-info}/LICENSE +0 -0
- {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/top_level.txt +0 -0

transformers/core_model_loading.py
@@ -1,3 +1,4 @@
+# coding=utf-8
 # Copyright 2025 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,24 +16,22 @@
 
 from __future__ import annotations
 
-import math
 import os
 import re
 from abc import abstractmethod
 from collections import defaultdict
-from collections.abc import
+from collections.abc import MutableMapping, MutableSet
 from concurrent.futures import Future, ThreadPoolExecutor
 from contextlib import contextmanager
 from copy import deepcopy
 from dataclasses import dataclass, field
-from
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Optional, Union
 
 import torch
 
-from .integrations.accelerate import
+from .integrations.accelerate import offload_weight
 from .integrations.tensor_parallel import ALL_PARALLEL_STYLES
-from .utils import
+from .utils import is_torch_greater_or_equal, logging
 
 
 _torch_distributed_available = torch.distributed.is_available()
@@ -49,40 +48,8 @@ if TYPE_CHECKING:
 logger = logging.get_logger(__name__)
 
 
-def process_target_pattern(pattern: str) -> tuple[str, str | None]:
-    """
-    Process a target pattern for reverse mapping (when targets become sources).
-
-    This handles several edge cases in checkpoint conversion mappings:
-    - Removes `^` prefix and `$` suffix (start/end of string anchors)
-    - Removes negative lookahead/lookbehind assertions
-    - Detects capturing groups and replaces them with `\\1` backreference
-
-    Args:
-        pattern: The target pattern to process for reverse mapping.
-
-    Returns:
-        A tuple of (processed_pattern, captured_group) where captured_group is
-        the original capturing group found (e.g., "(encoder|decoder)") or None.
-    """
-    # Some mapping contains `^` to notify start of string when matching -> remove it during reverse mapping
-    pattern = pattern.removeprefix("^")
-    # Some mapping contains `$` to notify end of string when matching -> remove it during reverse mapping
-    pattern = pattern.removesuffix("$")
-    # Remove negative lookahead/behind if any. This is ugly but needed for reverse mapping of
-    # Qwen2.5, Sam3, Ernie4.5 VL MoE!
-    pattern = re.sub(r"\(\?.+\)", "", pattern)
-    # Allow capturing groups in patterns, i.e. to add/remove a prefix to all keys (e.g. timm_wrapper, sam3)
-    capturing_group_match = re.search(r"\(.+?\)", pattern)
-    captured_group = None
-    if capturing_group_match:
-        captured_group = capturing_group_match.group(0)
-        pattern = pattern.replace(captured_group, r"\1", 1)
-    return pattern, captured_group
-
-
 def build_glob_alternation(
-    globs: list[WeightRenaming
+    globs: list[Union[WeightRenaming, WeightConverter, str]],
 ) -> tuple[re.Pattern, dict[str, str], dict[str, str]]:
     """
     Build a single alternation regex with one named group per glob.
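
The removed process_target_pattern helper normalized a target pattern so it could be reused as a source when a conversion mapping is reversed. A worked example of the three normalization steps (the function body is the one deleted above; the input pattern is hypothetical):

    import re

    def process_target_pattern(pattern):
        # Strip the ^ / $ anchors, which only matter for forward matching.
        pattern = pattern.removeprefix("^").removesuffix("$")
        # Drop lookahead/lookbehind assertions such as "(?!...)".
        pattern = re.sub(r"\(\?.+\)", "", pattern)
        # Replace the first capturing group with a \1 backreference.
        match = re.search(r"\(.+?\)", pattern)
        captured = match.group(0) if match else None
        if captured:
            pattern = pattern.replace(captured, r"\1", 1)
        return pattern, captured

    print(process_target_pattern(r"^model\.(encoder|decoder)\.layers$"))
    # -> ('model\\.\\1\\.layers', '(encoder|decoder)')
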
@@ -145,12 +112,12 @@ class Chunk(ConversionOps):
     ) -> dict[str, torch.Tensor]:
         tensors = next(iter(input_dict.values()))
         tensor = tensors[0] if isinstance(tensors, list) else tensors
-        targets = self.
+        targets = self.get_target_pattern(input_dict, target_patterns)
         sizes = len(targets)
         chunks = torch.chunk(tensor, sizes, dim=self.dim)
         return dict(zip(targets, chunks))
 
-    def
+    def get_target_pattern(self, input_dict: dict, target_patterns: list[str]) -> list[str]:
         # Here we always return the target patterns
         if len(input_dict) > 1 or len(target_patterns) == 1:
             raise ValueError("Undefined Operation encountered!")
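
Chunk.convert reduces to a torch.chunk along the op's dim, producing one tensor per target pattern, as the hunk above shows. A minimal sketch with hypothetical key names and shapes:

    import torch

    # One fused projection split into three equal chunks along dim 0,
    # one chunk per target pattern (names and shapes are illustrative only).
    fused = torch.randn(12, 4)
    targets = ["q_proj.weight", "k_proj.weight", "v_proj.weight"]
    chunks = torch.chunk(fused, len(targets), dim=0)
    result = dict(zip(targets, chunks))
    assert result["k_proj.weight"].shape == (4, 4)
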
@@ -177,7 +144,7 @@ class Concatenate(ConversionOps):
     ) -> dict[str, torch.Tensor]:
         target_pattern = self.get_target_pattern(target_patterns)
         all_tensors = []
-        # Very important to keep the relative order of the source
+        # Very important to keep the relative order of the source patterms here, so we iterate over them not the
         # input directly as it's unordered!
         for source_pattern in source_patterns:
             tensors = input_dict[source_pattern]
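
The rewritten comment in Concatenate.convert is load-bearing: iterating over source_patterns, rather than over the collected dict, pins the fuse order no matter in which order the tensors were gathered. A sketch with hypothetical keys:

    import torch

    # The collected dict may arrive in any order; iterating over
    # source_patterns guarantees gate lands before up in the fused tensor.
    input_dict = {"up.weight": torch.ones(2, 3), "gate.weight": torch.zeros(2, 3)}
    source_patterns = ["gate.weight", "up.weight"]
    fused = torch.cat([input_dict[p] for p in source_patterns], dim=0)
    assert torch.equal(fused[:2], torch.zeros(2, 3))
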
@@ -277,44 +244,6 @@ class SplitModulelist(ConversionOps):
         return MergeModulelist(self.dim)
 
 
-class Transpose(ConversionOps):
-    """
-    Transposes the given tensor along dim0 and dim1.
-    """
-
-    def __init__(self, dim0: int = 0, dim1: int = 1):
-        self.dim0 = dim0
-        self.dim1 = dim1
-
-    @torch.no_grad
-    def convert(
-        self, input_dict: dict[str, torch.Tensor], source_patterns: list[str], target_patterns: list[str], **kwargs
-    ) -> dict[str, torch.Tensor]:
-        target_pattern = self.get_target_pattern(input_dict, source_patterns, target_patterns)
-        tensors = next(iter(input_dict.values()))
-        tensor = tensors[0] if isinstance(tensors, list) else tensors
-        return {target_pattern: torch.transpose(tensor, dim0=self.dim0, dim1=self.dim1).contiguous()}
-
-    def get_target_pattern(
-        self, input_dict: dict[str, torch.Tensor], source_patterns: list[str], target_patterns: list[str]
-    ) -> str:
-        if len(input_dict) != 1:
-            raise ValueError("Undefined Operation encountered!")
-        # Here it's the first operation of a chain, so return the source
-        if len(target_patterns) > 1:
-            if len(source_patterns) == 1:
-                return source_patterns[0]
-            else:
-                raise ValueError("Undefined Operation encountered!")
-        # Here it's the only operation, or the last operation in a chain, so we return the target
-        else:
-            return target_patterns[0]
-
-    @property
-    def reverse_op(self) -> ConversionOps:
-        return Transpose(dim0=self.dim1, dim1=self.dim0)
-
-
 class PermuteForRope(ConversionOps):
     """
     Applies the permutation required to convert complex RoPE weights to the split sin/cos format.
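
The deleted Transpose op is its own inverse once dim0 and dim1 are swapped, which is exactly what its reverse_op returned. A minimal check:

    import torch

    x = torch.randn(3, 5)
    y = torch.transpose(x, 0, 1).contiguous()        # forward op: dim0=0, dim1=1
    assert torch.equal(torch.transpose(y, 1, 0), x)  # reverse_op: dims swapped
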
@@ -349,217 +278,18 @@ class PermuteForRope(ConversionOps):
|
|
|
349
278
|
return output
|
|
350
279
|
|
|
351
280
|
|
|
352
|
-
class ErnieFuseAndSplitTextVisionExperts(ConversionOps):
|
|
353
|
-
r"""
|
|
354
|
-
Special operation that splits a module list over all keys and fuses over the number of original modules.
|
|
355
|
-
|
|
356
|
-
Example with 2 original modules "Gate" and "Up" with 2 target keys "Text" and "Vision":
|
|
357
|
-
|
|
358
|
-
ModuleList 1 ModuleList 2
|
|
359
|
-
[ Gate ] [ Up ]
|
|
360
|
-
| | | |
|
|
361
|
-
[Gate_Text] [Gate_Vision] [Up_Text] [Up_Vision]
|
|
362
|
-
\ \ / /
|
|
363
|
-
\ \ / /
|
|
364
|
-
\ / \ /
|
|
365
|
-
\ / \ /
|
|
366
|
-
[GateUp_Text] [GateUp_Vision]
|
|
367
|
-
|
|
368
|
-
The splits are equal and are defined by the amount of target keys.
|
|
369
|
-
The final fusions are defined by the amount of original module lists.
|
|
370
|
-
"""
|
|
371
|
-
|
|
372
|
-
def __init__(self, stack_dim: int = 0, concat_dim: int = 1):
|
|
373
|
-
self.stack_dim = stack_dim
|
|
374
|
-
self.concat_dim = concat_dim
|
|
375
|
-
|
|
376
|
-
def split_list_into_chunks(self, tensor_list: list[torch.Tensor], chunks: int = 2):
|
|
377
|
-
split_size = math.ceil(len(tensor_list) / chunks) # best effort split size
|
|
378
|
-
return [tensor_list[i * split_size : (i + 1) * split_size] for i in range(chunks)]
|
|
379
|
-
|
|
380
|
-
@torch.no_grad()
|
|
381
|
-
def convert(
|
|
382
|
-
self,
|
|
383
|
-
input_dict: dict[str, list[torch.Tensor]],
|
|
384
|
-
source_patterns: list[str],
|
|
385
|
-
target_patterns: list[str],
|
|
386
|
-
config,
|
|
387
|
-
**kwargs,
|
|
388
|
-
) -> dict[str, list[torch.Tensor]]:
|
|
389
|
-
valid_keys = input_dict.keys()
|
|
390
|
-
split_and_fused = defaultdict(list)
|
|
391
|
-
for key in source_patterns:
|
|
392
|
-
if key not in valid_keys:
|
|
393
|
-
raise ValueError(
|
|
394
|
-
f"Expected pattern {key} in collected tensors but only found tensors for: {valid_keys}"
|
|
395
|
-
)
|
|
396
|
-
|
|
397
|
-
tensors = input_dict.get(key, [])
|
|
398
|
-
split_tensor_lists = self.split_list_into_chunks(tensors, chunks=len(target_patterns))
|
|
399
|
-
stacked_tensors = (torch.stack(tensor_group, dim=self.stack_dim) for tensor_group in split_tensor_lists)
|
|
400
|
-
for idx, tensor_group in enumerate(stacked_tensors):
|
|
401
|
-
split_and_fused[target_patterns[idx]].append(tensor_group)
|
|
402
|
-
|
|
403
|
-
for k, v in split_and_fused.items():
|
|
404
|
-
split_and_fused[k] = torch.cat(v, dim=self.concat_dim)
|
|
405
|
-
|
|
406
|
-
return split_and_fused
|
|
407
|
-
|
|
408
|
-
@property
|
|
409
|
-
def reverse_op(self) -> ConversionOps:
|
|
410
|
-
return ErnieSplitAndDecoupleTextVisionExperts(stack_dim=self.stack_dim, concat_dim=self.concat_dim)
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
class ErnieSplitAndDecoupleTextVisionExperts(ConversionOps):
|
|
414
|
-
r"""
|
|
415
|
-
Special operation that splits a fused module list over all original modules and
|
|
416
|
-
then decouples them into a mixed module list each over all keys.
|
|
417
|
-
|
|
418
|
-
Example with 2 original modules "Gate" and "Up" with 2 target keys "Text" and "Vision":
|
|
419
|
-
|
|
420
|
-
[GateUp_Text] [GateUp_Vision]
|
|
421
|
-
/ \ / \
|
|
422
|
-
/ \ / \
|
|
423
|
-
/ / \ \
|
|
424
|
-
/ / \ \
|
|
425
|
-
[Gate_Text] [Gate_Vision] [Up_Text] [Up_Vision]
|
|
426
|
-
| | | |
|
|
427
|
-
[ Gate ] [ Up ]
|
|
428
|
-
ModuleList 1 ModuleList 2
|
|
429
|
-
|
|
430
|
-
The splits are equal and are defined by the amount of original module lists.
|
|
431
|
-
The final decoupled module lists are defined by the amount of keys.
|
|
432
|
-
"""
|
|
433
|
-
|
|
434
|
-
def __init__(self, stack_dim: int = 0, concat_dim: int = 1):
|
|
435
|
-
self.stack_dim = stack_dim
|
|
436
|
-
self.concat_dim = concat_dim
|
|
437
|
-
|
|
438
|
-
@torch.no_grad()
|
|
439
|
-
def convert(
|
|
440
|
-
self,
|
|
441
|
-
input_dict: dict[str, list[torch.Tensor]],
|
|
442
|
-
source_patterns: list[str],
|
|
443
|
-
target_patterns: list[str],
|
|
444
|
-
config,
|
|
445
|
-
**kwargs,
|
|
446
|
-
) -> dict[str, list[torch.Tensor]]:
|
|
447
|
-
fused_modules = len(target_patterns)
|
|
448
|
-
valid_keys = input_dict.keys()
|
|
449
|
-
split_tensors = []
|
|
450
|
-
for key in source_patterns:
|
|
451
|
-
if key not in valid_keys:
|
|
452
|
-
raise ValueError(
|
|
453
|
-
f"Expected pattern {key} in collected tensors but only found tensors for: {valid_keys}"
|
|
454
|
-
)
|
|
455
|
-
|
|
456
|
-
# Assuming that we get single sized lists here to index with 0
|
|
457
|
-
split_tensors.append(input_dict[key][0].chunk(fused_modules, dim=self.concat_dim))
|
|
458
|
-
|
|
459
|
-
decoupled = {}
|
|
460
|
-
for idx, key in enumerate(target_patterns):
|
|
461
|
-
tensor_groups = [
|
|
462
|
-
list(torch.unbind(tensor_group[idx], dim=self.stack_dim)) for tensor_group in split_tensors
|
|
463
|
-
]
|
|
464
|
-
tensor_list = list(chain.from_iterable(tensor_groups))
|
|
465
|
-
targets = [key.replace("*", f"{i}") for i in range(len(tensor_list))]
|
|
466
|
-
decoupled |= dict(zip(targets, tensor_list))
|
|
467
|
-
|
|
468
|
-
return decoupled
|
|
469
|
-
|
|
470
|
-
@property
|
|
471
|
-
def reverse_op(self) -> ConversionOps:
|
|
472
|
-
return ErnieFuseAndSplitTextVisionExperts(stack_dim=self.stack_dim, concat_dim=self.concat_dim)
|
|
473
|
-
|
|
474
|
-
|
|
-class Force16BytesAlignment(ConversionOps):
-    """
-    Ensures that the given tensor is 16-byte aligned in memory, and clones it if not.
-    This guarantees 16-byte alignment for kernels / implementations that use TMA or SIMD instructions like torch._grouped_mm.
-    """
-
-    @torch.no_grad()
-    def convert(
-        self, input_dict: dict[str, torch.Tensor], source_patterns: list[str], target_patterns: list[str], **kwargs
-    ) -> dict[str, torch.Tensor]:
-        target_pattern = self.get_target_pattern(input_dict, source_patterns, target_patterns)
-        tensors = next(iter(input_dict.values()))
-        tensor = tensors[0] if isinstance(tensors, list) else tensors
-        tensor = tensor.clone() if tensor.data_ptr() % 16 != 0 else tensor
-        return {target_pattern: tensor}
-
-    def get_target_pattern(
-        self, input_dict: dict[str, torch.Tensor], source_patterns: list[str], target_patterns: list[str]
-    ) -> str:
-        if len(input_dict) != 1:
-            raise ValueError("Undefined Operation encountered!")
-        # Here it's the first operation of a chain, so return the source
-        if len(target_patterns) > 1:
-            if len(source_patterns) == 1:
-                return source_patterns[0]
-            else:
-                raise ValueError("Undefined Operation encountered!")
-        # Here it's the only operation, or the last operation in a chain, so we return the target
-        else:
-            return target_patterns[0]
-
-    @property
-    def reverse_op(self) -> ConversionOps:
-        return Force16BytesAlignment()
-
-
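The alignment test the removed class performs is just a pointer check. A standalone sketch, assuming torch's default CPU allocator hands out buffers aligned to at least 16 bytes (it aligns to 64 by default):

    import torch

    t = torch.arange(32, dtype=torch.uint8)[3:]  # a view whose storage offset breaks 16-byte alignment
    if t.data_ptr() % 16 != 0:
        t = t.clone()  # clone() copies into a fresh, aligned allocation
    assert t.data_ptr() % 16 == 0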
 @dataclass(slots=True)
 class WeightTransform:
-    source_patterns: str
-    target_patterns: str
+    source_patterns: Union[str, list[str]] = field(init=True)
+    target_patterns: Union[str, list[str]] = field(init=True)
     compiled_sources: re.Pattern = field(init=False)

-    distributed_operation: TensorParallelLayer
-    quantization_operation: ConversionOps
+    distributed_operation: Optional[TensorParallelLayer] = None
+    quantization_operation: Optional[ConversionOps] = None

     collected_tensors: dict[str, list[Future]] = field(default_factory=lambda: defaultdict(list), init=False)
     layer_targets: dict[str, set[str]] = field(default_factory=lambda: defaultdict(set), init=False)

-    def __setattr__(self, name: str, value: Any):
-        if name == "source_patterns":
-            if isinstance(value, str):
-                value = [value]
-            normalized = []
-            for pattern in value:
-                if r"\1" in pattern:
-                    pattern = pattern.replace(r"\1", r"(.+)")
-                normalized.append(pattern)
-            object.__setattr__(self, name, normalized)
-            self._rebuild_compiled_sources()
-            return
-        if name == "target_patterns":
-            if isinstance(value, str):
-                value = [value]
-            normalized = []
-            for pattern in value:
-                # Some mappings contain `^` to mark the start of the string when matching -> remove it during reverse mapping
-                pattern = pattern.removeprefix("^")
-                # Some mappings contain `$` to mark the end of the string when matching -> remove it during reverse mapping
-                pattern = pattern.removesuffix("$")
-                # Remove negative lookahead/behind if any. This is ugly but needed for reverse mapping of
-                # Qwen2.5, Sam3, Ernie4.5 VL MoE!
-                pattern = re.sub(r"\(\?.+\)", "", pattern)
-                # Allow capturing groups in patterns, i.e. to add/remove a prefix to all keys (e.g. timm_wrapper, sam3)
-                if r"(.+)" in pattern:
-                    pattern = pattern.replace(r"(.+)", r"\1")
-                normalized.append(pattern)
-            object.__setattr__(self, name, normalized)
-            return
-        object.__setattr__(self, name, value)
-
-    def _rebuild_compiled_sources(self):
-        branches = []
-        for i, source_pattern in enumerate(self.source_patterns):
-            group_name = f"g{i}"
-            pattern = source_pattern.replace(".*.", r"\..*\.")
-            branches.append(f"(?P<{group_name}>{pattern})")
-        object.__setattr__(self, "compiled_sources", re.compile("|".join(branches)))
-
     def __post_init__(self):
         if isinstance(self.source_patterns, str):
             self.source_patterns = [self.source_patterns]
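The normalization the removed __setattr__ hooks applied (and that __post_init__ now performs, per the hunk below) maps between the backreference form used in target patterns and a real capturing group in source patterns. A minimal sketch of that round trip, with a hypothetical pattern:

    import re

    target_style = r"backbone.\1.attn"                   # \1 is a placeholder for a captured prefix
    source_style = target_style.replace(r"\1", r"(.+)")  # usable as a regex with a capturing group
    m = re.search(source_style, "backbone.layers.0.attn")
    assert m is not None and m.group(1) == "layers.0"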
@@ -569,45 +299,39 @@ class WeightTransform:
         # Due to how our `_checkpoint_conversion_mapping` mappings are written, we need a few exceptions here
         # when instantiating the reverse mapping (i.e. the targets become sources, and sources become targets)
         # The issues lie in the sources usually, so here we need to check the targets for the reversed mapping
-
-        # Process target_patterns: detect capturing groups and replace with \1
-        # Store the original capturing group patterns for reverse mapping
-        target_capturing_groups: list[str] = []
         for i, pattern in enumerate(self.target_patterns):
-
-
-
-
-
-
-
-
-            raise ValueError(
-                f"Multiple different capturing groups found in target_patterns: {unique_capturing_groups}. "
-                f"All target patterns must use the same capturing group pattern."
-            )
-        unique_capturing_group = unique_capturing_groups.pop() if unique_capturing_groups else None
+            # Some mappings contain `^` to mark the start of the string when matching -> remove it during reverse mapping
+            pattern = pattern.removeprefix("^")
+            # Remove negative lookahead if any. This is ugly but needed for reverse mapping of Qwen2.5 and Sam3!
+            pattern = re.sub(r"\(\?!.+\)", "", pattern)
+            # Allow capturing groups in patterns, i.e. to add/remove a prefix to all keys (e.g. timm_wrapper, sam3)
+            if r"(.+)" in pattern:
+                pattern = pattern.replace(r"(.+)", r"\1")
+            self.target_patterns[i] = pattern

         # We also need to check capturing groups in the sources during reverse mapping (e.g. timm_wrapper, sam3)
         for i, pattern in enumerate(self.source_patterns):
             if r"\1" in pattern:
-
-                raise ValueError(
-                    f"Source pattern '{pattern}' contains \\1 backreference, but no capturing groups "
-                    f"found in target_patterns."
-                )
-                # Use the unique capturing group from target_patterns for all sources
-                pattern = pattern.replace(r"\1", unique_capturing_group, 1)
+                pattern = pattern.replace(r"\1", r"(.+)")
             self.source_patterns[i] = pattern

         # Construct the regex we will use to rename keys from the sources to the targets
-
+        branches = []
+        for i, source_pattern in enumerate(self.source_patterns):
+            group_name = f"g{i}"
+            pattern = source_pattern.replace(".*.", r"\..*\.")
+            branches.append(f"(?P<{group_name}>{pattern})")
+        self.compiled_sources = re.compile("|".join(branches))

     def add_tensor(self, target_key: str, source_key: str, source_pattern: str, future: Future):
         self.collected_tensors[source_pattern].append(future)
         self.layer_targets[target_key].add(source_key)

-    def
+    def reset(self) -> None:
+        """Clean up the collected tensors to make sure we don't keep references to past tensors in memory."""
+        self.collected_tensors = defaultdict(list)
+
+    def rename_source_key(self, source_key: str) -> tuple[str, str | None]:
         """
         Return a tuple (renamed_key, source_pattern_producing_the_match).
         Try renaming `source_key` according to the source and target patterns of the current WeightTransform.
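The regex built at the end of __post_init__ ORs every source pattern into one named-group branch, so a single search both matches a key and identifies which pattern produced the match. A self-contained sketch with made-up patterns:

    import re

    source_patterns = ["q_proj", "k_proj", "v_proj"]
    branches = [f"(?P<g{i}>{p})" for i, p in enumerate(source_patterns)]
    compiled_sources = re.compile("|".join(branches))

    m = compiled_sources.search("model.layers.0.self_attn.k_proj.weight")
    matched = next(name for name, val in m.groupdict().items() if val is not None)
    assert source_patterns[int(matched[1:])] == "k_proj"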
@@ -618,19 +342,19 @@ class WeightTransform:
         match_object = self.compiled_sources.search(source_key)
         if match_object is None:
             return source_key, None
-
         # Find the source that produced the match (it's the first group that matched, as the search stops after first branch match)
         matching_group_name = next(name for name, val in match_object.groupdict().items() if val is not None)
         source_pattern_that_matched = self.source_patterns[int(matching_group_name[1:])]
         # If we matched, we always replace with the first target pattern, in case we have several (one to many transform)
         replacement = self.target_patterns[0]
-        # Allow capturing groups in patterns, i.e. to add a prefix to all keys (e.g. timm_wrapper, sam3)
+        # # Allow capturing groups in patterns, i.e. to add a prefix to all keys (e.g. timm_wrapper, sam3)
         if r"\1" in replacement:
             # The index of the internal group we need to replace is the index of the matched named group as it comes
             # inside that matched named group
             replaced_group_idx = self.compiled_sources.groupindex[matching_group_name] + 1
             replacement = replacement.replace(r"\1", match_object.group(replaced_group_idx))
         renamed_key = source_key.replace(match_object.group(0), replacement)
+
         return renamed_key, source_pattern_that_matched

     def reverse_transform(self) -> WeightTransform:
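Putting the pieces of rename_source_key together on a toy pattern: the named branch locates the match, the inner capturing group carries the prefix, and the literal match (group(0)) is swapped for the expanded target. A hedged sketch, not the library call itself:

    import re

    compiled = re.compile(r"(?P<g0>(.+)\.attn)")
    target = r"backbone.\1.attn"                    # first target pattern, with a \1 placeholder
    key = "layers.0.attn.weight"
    m = compiled.search(key)
    # the inner group sits right after the named group, hence groupindex + 1
    inner_idx = compiled.groupindex["g0"] + 1
    replacement = target.replace(r"\1", m.group(inner_idx))
    renamed = key.replace(m.group(0), replacement)
    assert renamed == "backbone.layers.0.attn.weight"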
@@ -651,32 +375,6 @@ class WeightTransform:

         return reverse_transform

-    def materialize_tensors(self) -> dict[str, list[torch.Tensor]]:
-        """
-        Materialize all the tensors that were saved in `self.collected_tensors`. This function removes them from the
-        internal attribute to avoid keeping them in memory during the different `self.convert` operations, and returns
-        a new dictionary (otherwise we use more memory than needed during loading).
-
-        We basically have 3 cases here:
-        - async loading (default): the tensors are Future instances that we need to wait for
-        - sync loading: the tensors are Callables, we need to call them to actually load the tensors from disk
-        - saving: the tensors are already torch.Tensor instances (the existing model weights)
-        """
-        collected_tensors = {}
-        for key in set(self.collected_tensors.keys()):
-            # Remove from internal attribute
-            tensors = self.collected_tensors.pop(key)
-            # Async loading
-            if isinstance(tensors[0], Future):
-                tensors = [future.result() for future in tensors]
-            # Sync loading
-            elif callable(tensors[0]):
-                tensors = [func() for func in tensors]
-            # Add them to the new dictionary
-            collected_tensors[key] = tensors
-
-        return collected_tensors
-

 @dataclass(slots=True)
 class WeightRenaming(WeightTransform):
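The three cases the removed materialize_tensors docstring lists can be resolved with one small helper. A sketch under the same assumptions (Futures for async loading, plain callables for sync loading, tensors when saving); the helper name is invented for illustration:

    from concurrent.futures import Future, ThreadPoolExecutor
    import torch

    def materialize(entries):
        if isinstance(entries[0], Future):  # async loading: wait for the pool
            return [f.result() for f in entries]
        if callable(entries[0]):            # sync loading: run the delayed reads now
            return [fn() for fn in entries]
        return entries                      # saving: already torch.Tensor instances

    pool = ThreadPoolExecutor(max_workers=2)
    futures = [pool.submit(torch.zeros, 2, 2) for _ in range(3)]
    assert all(t.shape == (2, 2) for t in materialize(futures))
    pool.shutdown()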
@@ -688,22 +386,22 @@ class WeightRenaming(WeightTransform):
         model=None,
         config=None,
         hf_quantizer=None,
-        missing_keys: MutableSet[str]
-
+        missing_keys: Optional[MutableSet[str]] = None,
+        misc: Optional[MutableMapping[str, str]] = None,
     ):
-        # Collect the
-
-
+        # Collect the tensor if using threading
+        for pattern, futures in self.collected_tensors.items():
+            self.collected_tensors[pattern] = (
+                futures if isinstance(futures[0], torch.Tensor) else [future.result() for future in futures]
+            )

         # Perform renaming op (for a simple WeightRenaming, `self.source_patterns` and `self.target_patterns` can
         # only be of length 1, and are actually the full key names - we also have only 1 single related tensor)
         target_key = self.target_patterns[0]
-        collected_tensors = {target_key: collected_tensors[self.source_patterns[0]]}
+        collected_tensors = {target_key: self.collected_tensors[self.source_patterns[0]]}

         if hf_quantizer is not None and self.quantization_operation is not None:
-            with
-                layer_name, conversion_errors, (len(collected_tensors), layer_name), self.quantization_operation
-            ):
+            with log_to_misc(layer_name, misc, (self.collected_tensors, layer_name), self.quantization_operation):
                 collected_tensors = self.quantization_operation.convert(
                     collected_tensors,
                     source_patterns=self.source_patterns,
@@ -714,14 +412,7 @@ class WeightRenaming(WeightTransform):
             missing_keys=missing_keys,
         )

-        return collected_tensors,
-
-
-# List of classes that are known to be able to use m:n
-_INTERNAL_MANY_TO_MANY_CONVERSIONS = (
-    ErnieFuseAndSplitTextVisionExperts,
-    ErnieSplitAndDecoupleTextVisionExperts,
-)
+        return collected_tensors, misc


 @dataclass(slots=True)
@@ -729,12 +420,11 @@ class WeightConverter(WeightTransform):
     operations: list[ConversionOps] = field(default_factory=list, repr=False)

     def __post_init__(self):
+        WeightTransform.__post_init__(self)
         if bool(len(self.source_patterns) - 1) + bool(len(self.target_patterns) - 1) >= 2:
-
-
-
-                f"source keys={self.source_patterns}, target_patterns={self.target_patterns} but you can only have one to many, one to one or many to one."
-            )
+            raise ValueError(
+                f"source keys={self.source_patterns}, target_patterns={self.target_patterns} but you can only have one to many, one to one or many to one."
+            )
         if not self.operations:
             raise ValueError("WeightConverter requires at least one operation.")
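The arity guard in __post_init__ reads tersely; unfolded, it simply rejects many-to-many mappings while allowing 1-to-1, 1-to-N and N-to-1. A standalone restatement with an invented helper name:

    def is_many_to_many(source_patterns, target_patterns):
        return (len(source_patterns) > 1) + (len(target_patterns) > 1) >= 2

    assert not is_many_to_many(["qkv"], ["q", "k", "v"])  # one-to-many: allowed
    assert is_many_to_many(["a", "b"], ["c", "d"])        # many-to-many: rejected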
@@ -744,15 +434,18 @@ class WeightConverter(WeightTransform):
         model=None,
         config=None,
         hf_quantizer=None,
-        missing_keys: MutableSet[str]
-
+        missing_keys: Optional[MutableSet[str]] = None,
+        misc: Optional[MutableMapping[str, str]] = None,
     ):
-        # Collect
-
-
+        # Collect all tensors if using threading
+        for pattern, futures in self.collected_tensors.items():
+            self.collected_tensors[pattern] = (
+                futures if isinstance(futures[0], torch.Tensor) else [future.result() for future in futures]
+            )

+        collected_tensors = self.collected_tensors
         for op in self.operations:
-            with
+            with log_to_misc(layer_name, misc, (collected_tensors, layer_name), op):
                 collected_tensors = op.convert(
                     collected_tensors,
                     source_patterns=self.source_patterns,
@@ -769,19 +462,11 @@ class WeightConverter(WeightTransform):
         full_name = layer_name
         if ".*." in layer_name:
             full_name = layer_name.replace(".*.", ".0.")
-
-
-
-            # Rename the tensors
-            collected_tensors = {prefix + k + suffix: v for k, v in collected_tensors.items()}
-            # some quantizers need to already rename in `convert` as they cannot rely only on prefix and suffix
-        except StopIteration:
-            pass
-
+        prefix, _, suffix = next(full_name.partition(k) for k in collected_tensors.keys() if k in full_name)
+        # Rename the tensors
+        collected_tensors = {prefix + k + suffix: v for k, v in collected_tensors.items()}
         if hf_quantizer is not None and self.quantization_operation is not None:
-            with
-                layer_name, conversion_errors, (len(collected_tensors), layer_name), self.quantization_operation
-            ):
+            with log_to_misc(layer_name, misc, (collected_tensors, layer_name), self.quantization_operation):
                 collected_tensors = self.quantization_operation.convert(
                     collected_tensors,
                     source_patterns=self.source_patterns,
@@ -791,7 +476,7 @@ class WeightConverter(WeightTransform):
                 model=model,
                 missing_keys=missing_keys,
             )
-        return collected_tensors,
+        return collected_tensors, misc


 # For I/O bound operations (i.e. here reading files), it is better to have fewer threads, e.g. 4 is a good default.
@@ -800,46 +485,25 @@ class WeightConverter(WeightTransform):
 GLOBAL_WORKERS = min(4, os.cpu_count() or 4)


-def _materialize_copy(tensor
-    # This slicing is what actually loads the tensor from the safetensors slice object
+def _materialize_copy(tensor, device=None, dtype=None):
     tensor = tensor[...]
     if dtype is not None or device is not None:
         tensor = tensor.to(device=device, dtype=dtype)
     return tensor


-def spawn_materialize(
-    thread_pool: ThreadPoolExecutor | None, tensor: torch.Tensor, device=None, dtype=None
-) -> Future | Callable:
-    """Materialize a tensor from file asynchronously if `thread_pool` is provided, or return a Callable that will
-    load the tensor synchronously when called."""
-
+def spawn_materialize(thread_pool, tensor, device=None, dtype=None) -> Future:
     def _job():
         return _materialize_copy(tensor, device, dtype)

-
-        return thread_pool.submit(_job)
-    else:
-        # Return the Callable here, not the Tensor itself, so we actually delay loading to avoid saturating cpu
-        # memory during conversion
-        return _job
-
+    return thread_pool.submit(_job)

-def spawn_tp_materialize(
-    thread_pool: ThreadPoolExecutor | None, tensor: torch.Tensor, sharding_method, tensor_idx, device=None, dtype=None
-) -> Future | Callable:
-    """Materialize and shard a tensor (according to the TP-plan) from file asynchronously if `thread_pool` is provided, or
-    return a Callable that will load the tensor synchronously when called."""

+def spawn_tp_materialize(thread_pool, tensor, sharding_method, tensor_idx, dtype=None) -> Future:
     def _job():
-        return sharding_method.shard_tensor(tensor,
+        return sharding_method.shard_tensor(tensor, param_casting_dtype=dtype, tensor_idx=tensor_idx)[0]

-
-        return thread_pool.submit(_job)
-    else:
-        # Return the Callable here, not the Tensor itself, so we actually delay loading to avoid saturating cpu
-        # memory during conversion
-        return _job
+    return thread_pool.submit(_job)


 def dot_natural_key(s: str):
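Both spawn helpers now unconditionally push a closure onto the shared pool and hand back a Future. A minimal sketch of that flow, with a stand-in for the safetensors slice read:

    from concurrent.futures import ThreadPoolExecutor
    import torch

    def lazy_read():
        return torch.zeros(2, 2)  # stand-in for `tensor[...]` on a safetensors slice

    pool = ThreadPoolExecutor(max_workers=4)
    future = pool.submit(lazy_read)  # scheduling is cheap; the read happens on a worker thread
    tensor = future.result()         # blocks only when the value is actually needed
    pool.shutdown(wait=False)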
@@ -852,22 +516,18 @@ def dot_natural_key(s: str):


 @contextmanager
-def log_conversion_errors(
+def log_to_misc(
     first_target_key: str,
-
+    misc: MutableMapping[str, str],
     extras: Any = None,
-    op: list[ConversionOps]
+    op: Union[list[ConversionOps], ConversionOps, None] = None,
 ):
-
-    that will be caught later to skip the parameters that raised the original Exception."""
+    # A simple helper to handle errors with contextual messages.
     try:
         yield
     except Exception as e:
-        # During reverse mapping, we do not log and skip errors
-        if conversion_errors is None:
-            raise e

-        def _format_op_name(curr_op: list[ConversionOps]
+        def _format_op_name(curr_op: Union[list[ConversionOps], ConversionOps, None]) -> Optional[str]:
             if curr_op is None:
                 return None
             if isinstance(curr_op, (list, tuple, set)):
@@ -879,21 +539,19 @@ def log_conversion_errors(

         op_name = _format_op_name(op)
         if isinstance(extras, tuple) and len(extras) == 2:
-
+            values, target_keys = extras
             descriptor = f"{op_name} " if op_name else ""
-
-                f"{e}\nError: {descriptor}on tensors destined for {target_keys}. Ckpt contains: {
+            misc[first_target_key] = (
+                f"{e}\nError: {descriptor}on tensors destined for {target_keys}. Ckpt contains: {len(values)}"
             )
         elif isinstance(extras, str):
             suffix = f" via {op_name}" if op_name else ""
-
+            misc[first_target_key] = f"{e}\nError{suffix} when processing parameter {extras}"
         elif extras is None and op_name:
-
+            misc[first_target_key] = f"{op_name}: {e}"
         else:
-
-
-            # Raise a specific Exception that we can catch easily
-            raise SkipParameters()
+            misc[first_target_key] = f"{extras} |Error: {e}"
+        raise SkipLayer()


 def set_param_for_module(
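The reworked helper is a context manager that records the failure under the first target key and converts any exception into the SkipLayer sentinel. A compressed, self-contained sketch of that control flow (simplified signature, same idea):

    from contextlib import contextmanager

    class SkipLayer(Exception):
        """Sketch of the sentinel defined further down."""

    @contextmanager
    def log_to_misc(first_target_key, misc):
        try:
            yield
        except Exception as e:
            misc[first_target_key] = str(e)
            raise SkipLayer()

    misc = {}
    try:
        with log_to_misc("model.layers.0.mlp.weight", misc):
            raise RuntimeError("shape mismatch")
    except SkipLayer:
        pass
    assert "shape mismatch" in misc["model.layers.0.mlp.weight"]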
@@ -902,20 +560,22 @@ def set_param_for_module(
     param_value: torch.Tensor,
     mismatch_keys: MutableSet[tuple[str, torch.Size, torch.Size]],
     missing_keys: MutableSet[str],
+    misc: MutableMapping[str, Any],
     unexpected_keys: MutableSet[str],
-    distributed_operation: TensorParallelLayer
+    distributed_operation: Optional[TensorParallelLayer],
     hf_quantizer: HfQuantizer,
 ):
-
-
-
-
-
-
-
-
-
-
+    with log_to_misc(target_name, misc, target_name):
+        module_path, _, param_name = target_name.rpartition(".")
+        module_obj = model.get_submodule(module_path) if module_path else model
+
+        ref = getattr(module_obj, param_name)
+        if ref is None:
+            unexpected_keys.add(target_name)
+        else:
+            use_dtensor = hasattr(distributed_operation, "use_dtensor") and distributed_operation.use_dtensor
+            if not isinstance(param_value, torch.nn.Parameter):
+                if distributed_operation is not None:
                     param_value = DTensor.from_local(
                         param_value,
                         distributed_operation.device_mesh,
@@ -924,17 +584,20 @@ def set_param_for_module(
                         shape=ref.size(),
                         stride=ref.stride(),
                     )
-
-
-
-
-
-
-
-
-
-
-
+                    if not use_dtensor:
+                        # we convert to local
+                        param_value = param_value.to_local()
+                if param_name not in module_obj._buffers:
+                    param_value = torch.nn.Parameter(param_value, requires_grad=param_value.is_floating_point())
+
+            # Remove from missing keys (it's either mismatched, or all good)
+            missing_keys.discard(target_name)
+            if ref is not None and ref.shape != param_value.shape and hf_quantizer is None:
+                mismatch_keys.add((target_name, param_value.shape, ref.shape))
+            else:
+                # super important otherwise _init_weight will re-init the param
+                param_value._is_hf_initialized = True
+                setattr(module_obj, param_name, param_value)


 def offload_and_maybe_resave_param(
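The new body of set_param_for_module resolves a dotted parameter name to its owning module and swaps the tensor in with setattr. The core of that, as a runnable sketch on a toy model (the flag assignment mirrors what the code above does; everything else here is illustrative):

    import torch

    model = torch.nn.Sequential(torch.nn.Linear(2, 2))
    target_name = "0.weight"
    module_path, _, param_name = target_name.rpartition(".")
    module_obj = model.get_submodule(module_path) if module_path else model

    param_value = torch.nn.Parameter(torch.zeros(2, 2))
    param_value._is_hf_initialized = True  # blocks a later re-initialization pass
    setattr(module_obj, param_name, param_value)
    assert torch.equal(model[0].weight, torch.zeros(2, 2))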
@@ -956,9 +619,8 @@ def offload_and_maybe_resave_param(
     return disk_offload_index


-class SkipParameters(Exception):
-    """Control-flow sentinel: abort processing of the current
-    by a WeightConverter)."""
+class SkipLayer(Exception):
+    """Control-flow sentinel: abort processing of the current layer only."""

     pass
@@ -1004,10 +666,15 @@ def rename_source_key(
 def convert_and_load_state_dict_in_model(
     model: PreTrainedModel,
     state_dict: dict[str, Any],
-
+    weight_mapping: list[WeightConverter | WeightRenaming] | None,
     tp_plan: dict[str, str] | None,
+    hf_quantizer: HfQuantizer | None,
+    dtype: torch.dtype | None = None,
+    device_map: dict | None = None,
     dtype_plan: dict | None = None,
+    device_mesh: torch.distributed.device_mesh.DeviceMesh | None = None,
     disk_offload_index: dict | None = None,
+    disk_offload_folder: str | None = None,
 ):
     r"""
     We build a mapping from the keys obtained by renaming each of the checkpoint keys according to the weight_mapping rules.
@@ -1021,7 +688,7 @@ def convert_and_load_state_dict_in_model(
             target_patterns=["q", "k", "v"],
             operations=[Chunk(dim=0, chunks=3)]),
         collected_tensors={
-            "qkv": [Future]},
+            "qkv": [Future, Future, Future]},
         layer_targets={
             "model.layers.0.attention.q.weight": {"model.layers.0.attention.qkv.weight"},
             "model.layers.0.attention.k.weight": {"model.layers.0.attention.qkv.weight"},
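The Chunk(dim=0, chunks=3) op in this docstring example is ordinary tensor chunking; for one fused qkv weight it would behave like this sketch (shapes invented for illustration):

    import torch

    qkv = torch.randn(9, 4)        # q, k and v stacked along dim 0
    q, k, v = qkv.chunk(3, dim=0)  # one source tensor fans out to three targets
    targets = dict(zip(["q.weight", "k.weight", "v.weight"], (q, k, v)))
    assert targets["k.weight"].shape == (3, 4)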
@@ -1097,32 +764,26 @@ def convert_and_load_state_dict_in_model(
     """
     prefix = model.base_model_prefix
     tp_plan = tp_plan or {}
-    device_map =
-
-
-
-
-    offload_buffers = load_config.offload_buffers
+    device_map = device_map or {"": "cpu"}
+    # Here, we first sort by number of submodules, then length of the full string, to make sure to match correctly
+    device_map_regex = re.compile(
+        "|".join(rf"({k})" for k in sorted(device_map.keys(), key=lambda x: (x.count("."), len(x)), reverse=True))
+    )
     dtype_plan = dtype_plan or {}
-    weight_mapping =
+    weight_mapping = weight_mapping or []
     meta_model_state_dict = model.state_dict()
-    model_buffers = {k for k, _ in model.named_buffers()}
-
     missing_keys = set(meta_model_state_dict.keys())
-
+
+    misc = {}
     mismatch_keys = set()
     unexpected_keys = set()
-
-
-    # we cannot use it either to control the memory as we are under memory constraints, so we need to be sequential
-    if is_env_variable_true("HF_DEACTIVATE_ASYNC_LOAD") or "disk" in device_map.values():
-        thread_pool = None
-    else:
-        thread_pool = ThreadPoolExecutor(max_workers=GLOBAL_WORKERS)
+    # Global thread_pool
+    thread_pool = ThreadPoolExecutor(max_workers=GLOBAL_WORKERS)

     renamings = [entry for entry in weight_mapping if isinstance(entry, WeightRenaming)]
     converters = [entry for entry in weight_mapping if isinstance(entry, WeightConverter)]
-
+
+    param_name_to_load: dict[str, Union[WeightRenaming | WeightConverter]] = {}

     # build '(?P<g0>.*.*\\.block_sparse_moe\\..*)' and group to source {'g0': '*.block_sparse_moe.'}
     # and target to source {'g0': '*.mlp.'}. This allows us to quickly find which pattern matched.
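The device_map regex added above relies on alternation order: deeper and longer module paths are sorted first, so the most specific placement wins. A sketch of the matching logic (keys are escaped here for clarity, which the original does not do):

    import re

    device_map = {"": "cpu", "model.layers": "disk", "model.layers.0": 0}
    keys = sorted((k for k in device_map if k), key=lambda x: (x.count("."), len(x)), reverse=True)
    device_map_regex = re.compile("|".join(rf"({re.escape(k)})" for k in keys))

    m = device_map_regex.match("model.layers.0.self_attn.q_proj.weight")
    param_device = device_map[m.group()] if m else device_map.get("", "cpu")
    assert param_device == 0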
@@ -1165,40 +826,41 @@ def convert_and_load_state_dict_in_model(
             if hf_quantizer and hf_quantizer.pre_quantized and original_key != renamed_key:
                 # if the key was renamed as it is not available in the state dict otherwise, it means that we are deserializing it,
                 # so we need to make sure to load the tensor with the same dtype from the checkpoint
-                # TODO: make the condition stricter for native fp8 models such as qwen2moe fp8
                 _dtype = None
             elif dtype_plan != {} and dtype_policy_alt.search(renamed_key):
                 matched_dtype_pattern = dtype_policy_alt.search(renamed_key)
                 if matched_dtype_pattern is not None:
-                    _dtype = dtype_plan[
+                    _dtype = dtype_plan[matched_dtype_pattern.group()]
             elif empty_param is not None and empty_param.dtype != _dtype:
                 _dtype = empty_param.dtype  # usually correct when initializing

-            # 4. Handle TP sharding or device_map placement
-
+            # 4. Handle TP sharding or device_map placement -> scheduled materialization
+            future = None
             if device_mesh:
                 if matched_tp_pattern := tp_plan_alt.search(renamed_key):
                     matched_tp_pattern = tp_plan_by_group_name[matched_tp_pattern.lastgroup]
                     if getattr(mapping, "distributed_operation", None) is None:
                         tp_layer = ALL_PARALLEL_STYLES[model.tp_plan[matched_tp_pattern]].__class__
                         mapping.distributed_operation = tp_layer(
-                            device_mesh=device_mesh, rank=
+                            device_mesh=device_mesh, rank=device_map[""].index, empty_param=empty_param.clone()
                         )
                     shard_index = len(mapping.collected_tensors.get(original_key, []))
-
+                    future = spawn_tp_materialize(
                         thread_pool,
                         tensor,
                         mapping.distributed_operation,
                         shard_index,
-                        device_map[""],
                         _dtype,
                     )

-            if
-
-
+            if future is None:
+                device_match = device_map_regex.match(renamed_key)
+                param_device = device_map[device_match.group()] if device_match else device_map.get("", "cpu")
+                # If disk, we need to materialize on cpu first
+                param_device = "cpu" if param_device == "disk" else param_device
+                future = spawn_materialize(thread_pool, tensor, param_device, _dtype)

-            mapping.add_tensor(renamed_key, original_key, source_pattern,
+            mapping.add_tensor(renamed_key, original_key, source_pattern, future)
         elif source_pattern is not None:  # add all target keys as unexpected
             mapping = pattern_to_converter[source_pattern]
             for k in mapping.target_patterns:
@@ -1206,57 +868,52 @@ def convert_and_load_state_dict_in_model(
         else:
             unexpected_keys.add(renamed_key)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Close the pool, independently of whether the code was interrupted or finished successfully
-    finally:
-        if thread_pool is not None:
-            # `cancel_futures=True` in case the program was interrupted, to avoid wasting time on exit
-            thread_pool.shutdown(wait=False, cancel_futures=True)
+    total_entries = len(param_name_to_load)
+    with logging.tqdm(total=total_entries, desc="Loading weights") as pbar:
+        for first_param_name, mapping in param_name_to_load.items():
+            pbar.update(1)
+            pbar.set_postfix({"Materializing param": first_param_name})
+            pbar.refresh()
+            try:
+                realized_value, misc = mapping.convert(
+                    first_param_name,
+                    model=model,
+                    config=model.config,
+                    hf_quantizer=hf_quantizer,
+                    missing_keys=missing_keys,
+                    misc=misc,
+                )
+                for target_name, param in realized_value.items():
+                    param = param[0] if isinstance(param, list) else param
+                    device_match = device_map_regex.match(target_name)
+                    param_device = device_map[device_match.group()] if device_match else device_map.get("", "cpu")
+                    # Offloading support
+                    if param_device == "disk":
+                        disk_offload_index = offload_and_maybe_resave_param(
+                            target_name, param, missing_keys, disk_offload_folder, disk_offload_index, mapping
+                        )
+                    else:
+                        set_param_for_module(
+                            model,
+                            target_name,
+                            param,
+                            mismatch_keys,
+                            missing_keys,
+                            misc,
+                            unexpected_keys,
+                            mapping.distributed_operation,
+                            hf_quantizer,
+                        )
+
+                # Cleanup the tensors
+                mapping.reset()
+            except SkipLayer:
+                continue

     # Keep the current weight conversion mapping for later saving (in case it was coming directly from the user)
     model._weight_conversions = weight_mapping
-
+    thread_pool.shutdown(wait=False)
+    return missing_keys, unexpected_keys, mismatch_keys, disk_offload_index, misc


 def revert_weight_conversion(model: PreTrainedModel, state_dict: dict[str, torch.Tensor]):
@@ -1303,7 +960,7 @@ def revert_weight_conversion(model: PreTrainedModel, state_dict: dict[str, torch
     new_state_dict = {}
     for first_param_name, reversed_converter in conversion_mapping.items():
         # Apply the reverse converter
-        realized_value,
+        realized_value, misc = reversed_converter.convert(first_param_name, model=model, config=model.config)
         for target_name, param in realized_value.items():
             param = param[0] if isinstance(param, list) else param
             new_state_dict[target_name] = param