transformers 5.0.0__py3-none-any.whl → 5.0.0rc0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their respective public registries.
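The per-file `+added -removed` counts below can be approximated locally. The following is a minimal sketch, assuming both wheel files have already been downloaded to the working directory; the filenames are placeholders and the counting rule (unified diff of shared `.py` members) is illustrative, so the numbers may not match exactly what the registry's diff tool computes.

```python
# Hypothetical sketch: summarize per-file "+added -removed" counts between two
# wheels by unpacking them (wheels are zip archives) and diffing shared .py files.
import zipfile
import difflib

OLD_WHL = "transformers-5.0.0-py3-none-any.whl"       # assumed local path
NEW_WHL = "transformers-5.0.0rc0-py3-none-any.whl"    # assumed local path


def read_sources(path):
    """Map archive member name -> list of text lines, for .py members only."""
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode("utf-8", errors="replace").splitlines()
            for name in zf.namelist()
            if name.endswith(".py")
        }


old, new = read_sources(OLD_WHL), read_sources(NEW_WHL)
for name in sorted(old.keys() & new.keys()):
    added = removed = 0
    for line in difflib.unified_diff(old[name], new[name], lineterm=""):
        # Skip the "---"/"+++" file headers; count only real +/- diff lines.
        if line.startswith("+") and not line.startswith("+++"):
            added += 1
        elif line.startswith("-") and not line.startswith("---"):
            removed += 1
    if added or removed:
        print(f"{name} +{added} -{removed}")
```

Note that this sketch only covers files present in both wheels; added, deleted, or renamed files (such as the rename entries in the listing below) would need to be reported separately from the archives' member lists.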
- transformers/__init__.py +36 -55
- transformers/activations.py +1 -1
- transformers/audio_utils.py +33 -32
- transformers/cache_utils.py +139 -32
- transformers/cli/chat.py +3 -3
- transformers/cli/serve.py +19 -49
- transformers/cli/transformers.py +1 -2
- transformers/configuration_utils.py +155 -129
- transformers/conversion_mapping.py +22 -158
- transformers/convert_slow_tokenizer.py +17 -227
- transformers/core_model_loading.py +185 -528
- transformers/data/data_collator.py +4 -12
- transformers/data/processors/glue.py +1 -0
- transformers/data/processors/utils.py +1 -0
- transformers/data/processors/xnli.py +1 -0
- transformers/dependency_versions_check.py +1 -0
- transformers/dependency_versions_table.py +7 -5
- transformers/distributed/configuration_utils.py +2 -1
- transformers/dynamic_module_utils.py +25 -24
- transformers/feature_extraction_sequence_utils.py +23 -19
- transformers/feature_extraction_utils.py +33 -64
- transformers/file_utils.py +1 -0
- transformers/generation/__init__.py +1 -11
- transformers/generation/candidate_generator.py +33 -80
- transformers/generation/configuration_utils.py +133 -189
- transformers/generation/continuous_batching/__init__.py +1 -4
- transformers/generation/continuous_batching/cache.py +25 -83
- transformers/generation/continuous_batching/cache_manager.py +45 -155
- transformers/generation/continuous_batching/continuous_api.py +147 -270
- transformers/generation/continuous_batching/requests.py +3 -51
- transformers/generation/continuous_batching/scheduler.py +105 -160
- transformers/generation/logits_process.py +128 -0
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/streamers.py +1 -0
- transformers/generation/utils.py +123 -122
- transformers/generation/watermarking.py +6 -8
- transformers/hf_argparser.py +13 -9
- transformers/hyperparameter_search.py +2 -1
- transformers/image_processing_base.py +23 -12
- transformers/image_processing_utils.py +15 -11
- transformers/image_processing_utils_fast.py +75 -85
- transformers/image_transforms.py +42 -73
- transformers/image_utils.py +32 -30
- transformers/initialization.py +0 -37
- transformers/integrations/__init__.py +2 -16
- transformers/integrations/accelerate.py +113 -58
- transformers/integrations/aqlm.py +66 -36
- transformers/integrations/awq.py +516 -45
- transformers/integrations/bitnet.py +105 -47
- transformers/integrations/bitsandbytes.py +202 -91
- transformers/integrations/deepspeed.py +4 -161
- transformers/integrations/eetq.py +82 -84
- transformers/integrations/executorch.py +1 -1
- transformers/integrations/fbgemm_fp8.py +145 -190
- transformers/integrations/finegrained_fp8.py +215 -249
- transformers/integrations/flash_attention.py +3 -3
- transformers/integrations/flex_attention.py +1 -1
- transformers/integrations/fp_quant.py +0 -90
- transformers/integrations/ggml.py +2 -11
- transformers/integrations/higgs.py +62 -37
- transformers/integrations/hub_kernels.py +8 -65
- transformers/integrations/integration_utils.py +3 -47
- transformers/integrations/mistral.py +0 -12
- transformers/integrations/mxfp4.py +80 -33
- transformers/integrations/peft.py +191 -483
- transformers/integrations/quanto.py +56 -77
- transformers/integrations/spqr.py +90 -42
- transformers/integrations/tensor_parallel.py +221 -167
- transformers/integrations/torchao.py +43 -35
- transformers/integrations/vptq.py +59 -40
- transformers/kernels/__init__.py +0 -0
- transformers/{models/pe_audio_video/processing_pe_audio_video.py → kernels/falcon_mamba/__init__.py} +3 -12
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +529 -0
- transformers/loss/loss_utils.py +0 -2
- transformers/masking_utils.py +55 -51
- transformers/model_debugging_utils.py +5 -4
- transformers/modelcard.py +194 -15
- transformers/modeling_attn_mask_utils.py +19 -19
- transformers/modeling_flash_attention_utils.py +27 -27
- transformers/modeling_gguf_pytorch_utils.py +24 -79
- transformers/modeling_layers.py +22 -21
- transformers/modeling_outputs.py +253 -242
- transformers/modeling_rope_utils.py +117 -138
- transformers/modeling_utils.py +739 -850
- transformers/models/__init__.py +0 -27
- transformers/models/afmoe/configuration_afmoe.py +33 -40
- transformers/models/afmoe/modeling_afmoe.py +54 -42
- transformers/models/afmoe/modular_afmoe.py +33 -23
- transformers/models/aimv2/configuration_aimv2.py +10 -2
- transformers/models/aimv2/modeling_aimv2.py +42 -47
- transformers/models/aimv2/modular_aimv2.py +19 -17
- transformers/models/albert/configuration_albert.py +2 -8
- transformers/models/albert/modeling_albert.py +69 -70
- transformers/models/albert/tokenization_albert.py +14 -5
- transformers/models/align/configuration_align.py +6 -8
- transformers/models/align/modeling_align.py +89 -94
- transformers/models/align/processing_align.py +30 -2
- transformers/models/altclip/configuration_altclip.py +7 -4
- transformers/models/altclip/modeling_altclip.py +103 -114
- transformers/models/altclip/processing_altclip.py +15 -2
- transformers/models/apertus/__init__.py +1 -0
- transformers/models/apertus/configuration_apertus.py +28 -23
- transformers/models/apertus/modeling_apertus.py +40 -39
- transformers/models/apertus/modular_apertus.py +38 -37
- transformers/models/arcee/configuration_arcee.py +30 -25
- transformers/models/arcee/modeling_arcee.py +39 -36
- transformers/models/arcee/modular_arcee.py +23 -20
- transformers/models/aria/configuration_aria.py +44 -31
- transformers/models/aria/image_processing_aria.py +27 -25
- transformers/models/aria/modeling_aria.py +106 -110
- transformers/models/aria/modular_aria.py +127 -118
- transformers/models/aria/processing_aria.py +35 -28
- transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +1 -0
- transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +6 -3
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +8 -6
- transformers/models/audioflamingo3/__init__.py +1 -0
- transformers/models/audioflamingo3/configuration_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +49 -58
- transformers/models/audioflamingo3/modular_audioflamingo3.py +43 -53
- transformers/models/audioflamingo3/processing_audioflamingo3.py +30 -33
- transformers/models/auto/auto_factory.py +7 -6
- transformers/models/auto/configuration_auto.py +5 -66
- transformers/models/auto/feature_extraction_auto.py +10 -14
- transformers/models/auto/image_processing_auto.py +41 -32
- transformers/models/auto/modeling_auto.py +188 -46
- transformers/models/auto/processing_auto.py +11 -24
- transformers/models/auto/tokenization_auto.py +588 -171
- transformers/models/auto/video_processing_auto.py +10 -12
- transformers/models/autoformer/configuration_autoformer.py +7 -4
- transformers/models/autoformer/modeling_autoformer.py +101 -104
- transformers/models/aya_vision/configuration_aya_vision.py +1 -4
- transformers/models/aya_vision/modeling_aya_vision.py +102 -71
- transformers/models/aya_vision/modular_aya_vision.py +74 -46
- transformers/models/aya_vision/processing_aya_vision.py +53 -25
- transformers/models/bamba/configuration_bamba.py +39 -34
- transformers/models/bamba/modeling_bamba.py +86 -82
- transformers/models/bamba/modular_bamba.py +72 -70
- transformers/models/bark/configuration_bark.py +8 -6
- transformers/models/bark/generation_configuration_bark.py +5 -3
- transformers/models/bark/modeling_bark.py +57 -54
- transformers/models/bark/processing_bark.py +41 -19
- transformers/models/bart/configuration_bart.py +6 -9
- transformers/models/bart/modeling_bart.py +126 -135
- transformers/models/barthez/tokenization_barthez.py +11 -3
- transformers/models/bartpho/tokenization_bartpho.py +7 -6
- transformers/models/beit/configuration_beit.py +11 -0
- transformers/models/beit/image_processing_beit.py +56 -53
- transformers/models/beit/image_processing_beit_fast.py +12 -10
- transformers/models/beit/modeling_beit.py +60 -69
- transformers/models/bert/configuration_bert.py +2 -12
- transformers/models/bert/modeling_bert.py +122 -114
- transformers/models/bert/tokenization_bert.py +23 -8
- transformers/models/bert/tokenization_bert_legacy.py +5 -3
- transformers/models/bert_generation/configuration_bert_generation.py +2 -17
- transformers/models/bert_generation/modeling_bert_generation.py +49 -49
- transformers/models/bert_generation/tokenization_bert_generation.py +3 -2
- transformers/models/bert_japanese/tokenization_bert_japanese.py +6 -5
- transformers/models/bertweet/tokenization_bertweet.py +3 -1
- transformers/models/big_bird/configuration_big_bird.py +9 -12
- transformers/models/big_bird/modeling_big_bird.py +109 -116
- transformers/models/big_bird/tokenization_big_bird.py +43 -16
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -9
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +117 -130
- transformers/models/biogpt/configuration_biogpt.py +2 -8
- transformers/models/biogpt/modeling_biogpt.py +76 -72
- transformers/models/biogpt/modular_biogpt.py +66 -62
- transformers/models/biogpt/tokenization_biogpt.py +5 -3
- transformers/models/bit/configuration_bit.py +1 -0
- transformers/models/bit/image_processing_bit.py +24 -21
- transformers/models/bit/image_processing_bit_fast.py +1 -0
- transformers/models/bit/modeling_bit.py +12 -25
- transformers/models/bitnet/configuration_bitnet.py +28 -23
- transformers/models/bitnet/modeling_bitnet.py +39 -36
- transformers/models/bitnet/modular_bitnet.py +6 -4
- transformers/models/blenderbot/configuration_blenderbot.py +5 -8
- transformers/models/blenderbot/modeling_blenderbot.py +96 -77
- transformers/models/blenderbot/tokenization_blenderbot.py +24 -18
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +5 -8
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +69 -79
- transformers/models/blenderbot_small/tokenization_blenderbot_small.py +3 -1
- transformers/models/blip/configuration_blip.py +10 -9
- transformers/models/blip/image_processing_blip.py +20 -17
- transformers/models/blip/image_processing_blip_fast.py +1 -0
- transformers/models/blip/modeling_blip.py +108 -117
- transformers/models/blip/modeling_blip_text.py +65 -73
- transformers/models/blip/processing_blip.py +36 -5
- transformers/models/blip_2/configuration_blip_2.py +2 -2
- transformers/models/blip_2/modeling_blip_2.py +118 -146
- transformers/models/blip_2/processing_blip_2.py +38 -8
- transformers/models/bloom/configuration_bloom.py +2 -5
- transformers/models/bloom/modeling_bloom.py +104 -77
- transformers/models/blt/configuration_blt.py +86 -94
- transformers/models/blt/modeling_blt.py +81 -238
- transformers/models/blt/modular_blt.py +65 -228
- transformers/models/bridgetower/configuration_bridgetower.py +2 -7
- transformers/models/bridgetower/image_processing_bridgetower.py +35 -34
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +16 -13
- transformers/models/bridgetower/modeling_bridgetower.py +119 -141
- transformers/models/bridgetower/processing_bridgetower.py +16 -2
- transformers/models/bros/configuration_bros.py +18 -24
- transformers/models/bros/modeling_bros.py +80 -90
- transformers/models/bros/processing_bros.py +12 -2
- transformers/models/byt5/tokenization_byt5.py +6 -4
- transformers/models/camembert/configuration_camembert.py +2 -8
- transformers/models/camembert/modeling_camembert.py +195 -196
- transformers/models/camembert/modular_camembert.py +54 -51
- transformers/models/camembert/tokenization_camembert.py +13 -6
- transformers/models/canine/configuration_canine.py +2 -4
- transformers/models/canine/modeling_canine.py +75 -84
- transformers/models/canine/tokenization_canine.py +1 -2
- transformers/models/chameleon/configuration_chameleon.py +34 -29
- transformers/models/chameleon/image_processing_chameleon.py +24 -21
- transformers/models/chameleon/image_processing_chameleon_fast.py +6 -5
- transformers/models/chameleon/modeling_chameleon.py +93 -142
- transformers/models/chameleon/processing_chameleon.py +41 -16
- transformers/models/chinese_clip/configuration_chinese_clip.py +8 -10
- transformers/models/chinese_clip/image_processing_chinese_clip.py +24 -21
- transformers/models/chinese_clip/image_processing_chinese_clip_fast.py +1 -0
- transformers/models/chinese_clip/modeling_chinese_clip.py +92 -96
- transformers/models/chinese_clip/processing_chinese_clip.py +15 -2
- transformers/models/clap/configuration_clap.py +9 -4
- transformers/models/clap/feature_extraction_clap.py +12 -11
- transformers/models/clap/modeling_clap.py +123 -136
- transformers/models/clap/processing_clap.py +15 -2
- transformers/models/clip/configuration_clip.py +2 -4
- transformers/models/clip/image_processing_clip.py +24 -21
- transformers/models/clip/image_processing_clip_fast.py +1 -9
- transformers/models/clip/modeling_clip.py +65 -65
- transformers/models/clip/processing_clip.py +14 -2
- transformers/models/clip/tokenization_clip.py +46 -21
- transformers/models/clipseg/configuration_clipseg.py +2 -4
- transformers/models/clipseg/modeling_clipseg.py +109 -119
- transformers/models/clipseg/processing_clipseg.py +42 -19
- transformers/models/clvp/configuration_clvp.py +5 -15
- transformers/models/clvp/feature_extraction_clvp.py +10 -7
- transformers/models/clvp/modeling_clvp.py +146 -155
- transformers/models/clvp/number_normalizer.py +2 -1
- transformers/models/clvp/processing_clvp.py +20 -3
- transformers/models/clvp/tokenization_clvp.py +64 -1
- transformers/models/code_llama/tokenization_code_llama.py +44 -18
- transformers/models/codegen/configuration_codegen.py +4 -4
- transformers/models/codegen/modeling_codegen.py +53 -63
- transformers/models/codegen/tokenization_codegen.py +47 -17
- transformers/models/cohere/configuration_cohere.py +30 -25
- transformers/models/cohere/modeling_cohere.py +42 -40
- transformers/models/cohere/modular_cohere.py +29 -26
- transformers/models/cohere/tokenization_cohere.py +46 -15
- transformers/models/cohere2/configuration_cohere2.py +32 -31
- transformers/models/cohere2/modeling_cohere2.py +44 -42
- transformers/models/cohere2/modular_cohere2.py +54 -54
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +14 -13
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +58 -59
- transformers/models/cohere2_vision/modular_cohere2_vision.py +46 -45
- transformers/models/cohere2_vision/processing_cohere2_vision.py +36 -6
- transformers/models/colpali/configuration_colpali.py +1 -0
- transformers/models/colpali/modeling_colpali.py +16 -14
- transformers/models/colpali/modular_colpali.py +51 -11
- transformers/models/colpali/processing_colpali.py +52 -14
- transformers/models/colqwen2/modeling_colqwen2.py +28 -28
- transformers/models/colqwen2/modular_colqwen2.py +74 -37
- transformers/models/colqwen2/processing_colqwen2.py +52 -16
- transformers/models/conditional_detr/configuration_conditional_detr.py +2 -1
- transformers/models/conditional_detr/image_processing_conditional_detr.py +70 -67
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +36 -36
- transformers/models/conditional_detr/modeling_conditional_detr.py +87 -99
- transformers/models/conditional_detr/modular_conditional_detr.py +3 -49
- transformers/models/convbert/configuration_convbert.py +8 -11
- transformers/models/convbert/modeling_convbert.py +87 -94
- transformers/models/convbert/tokenization_convbert.py +1 -0
- transformers/models/convnext/configuration_convnext.py +1 -0
- transformers/models/convnext/image_processing_convnext.py +23 -20
- transformers/models/convnext/image_processing_convnext_fast.py +21 -16
- transformers/models/convnext/modeling_convnext.py +12 -9
- transformers/models/convnextv2/configuration_convnextv2.py +1 -0
- transformers/models/convnextv2/modeling_convnextv2.py +12 -9
- transformers/models/cpm/tokenization_cpm.py +7 -6
- transformers/models/cpm/tokenization_cpm_fast.py +5 -3
- transformers/models/cpmant/configuration_cpmant.py +1 -4
- transformers/models/cpmant/modeling_cpmant.py +40 -38
- transformers/models/cpmant/tokenization_cpmant.py +3 -1
- transformers/models/csm/configuration_csm.py +66 -58
- transformers/models/csm/generation_csm.py +35 -31
- transformers/models/csm/modeling_csm.py +85 -85
- transformers/models/csm/modular_csm.py +58 -58
- transformers/models/csm/processing_csm.py +68 -25
- transformers/models/ctrl/configuration_ctrl.py +1 -16
- transformers/models/ctrl/modeling_ctrl.py +44 -54
- transformers/models/ctrl/tokenization_ctrl.py +1 -0
- transformers/models/cvt/configuration_cvt.py +1 -0
- transformers/models/cvt/modeling_cvt.py +16 -20
- transformers/models/cwm/__init__.py +1 -0
- transformers/models/cwm/configuration_cwm.py +12 -8
- transformers/models/cwm/modeling_cwm.py +39 -37
- transformers/models/cwm/modular_cwm.py +12 -10
- transformers/models/d_fine/configuration_d_fine.py +5 -7
- transformers/models/d_fine/modeling_d_fine.py +128 -138
- transformers/models/d_fine/modular_d_fine.py +18 -33
- transformers/models/dab_detr/configuration_dab_detr.py +3 -6
- transformers/models/dab_detr/modeling_dab_detr.py +75 -81
- transformers/models/dac/configuration_dac.py +1 -0
- transformers/models/dac/feature_extraction_dac.py +9 -6
- transformers/models/dac/modeling_dac.py +26 -24
- transformers/models/data2vec/configuration_data2vec_audio.py +2 -4
- transformers/models/data2vec/configuration_data2vec_text.py +3 -11
- transformers/models/data2vec/configuration_data2vec_vision.py +1 -0
- transformers/models/data2vec/modeling_data2vec_audio.py +56 -57
- transformers/models/data2vec/modeling_data2vec_text.py +93 -98
- transformers/models/data2vec/modeling_data2vec_vision.py +45 -49
- transformers/models/data2vec/modular_data2vec_audio.py +1 -6
- transformers/models/data2vec/modular_data2vec_text.py +54 -58
- transformers/models/dbrx/configuration_dbrx.py +22 -36
- transformers/models/dbrx/modeling_dbrx.py +45 -42
- transformers/models/dbrx/modular_dbrx.py +33 -31
- transformers/models/deberta/configuration_deberta.py +1 -6
- transformers/models/deberta/modeling_deberta.py +60 -64
- transformers/models/deberta/tokenization_deberta.py +21 -9
- transformers/models/deberta_v2/configuration_deberta_v2.py +1 -6
- transformers/models/deberta_v2/modeling_deberta_v2.py +65 -71
- transformers/models/deberta_v2/tokenization_deberta_v2.py +29 -11
- transformers/models/decision_transformer/configuration_decision_transformer.py +2 -3
- transformers/models/decision_transformer/modeling_decision_transformer.py +56 -60
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +44 -39
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +43 -43
- transformers/models/deepseek_v2/modular_deepseek_v2.py +49 -48
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +45 -40
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +42 -45
- transformers/models/deepseek_v3/modular_deepseek_v3.py +9 -14
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +3 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +26 -25
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +10 -10
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +48 -57
- transformers/models/deepseek_vl/modular_deepseek_vl.py +43 -14
- transformers/models/deepseek_vl/processing_deepseek_vl.py +41 -10
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +5 -3
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +35 -35
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +24 -20
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +61 -109
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +118 -146
- transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +44 -12
- transformers/models/deformable_detr/configuration_deformable_detr.py +3 -2
- transformers/models/deformable_detr/image_processing_deformable_detr.py +61 -59
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +28 -28
- transformers/models/deformable_detr/modeling_deformable_detr.py +82 -88
- transformers/models/deformable_detr/modular_deformable_detr.py +3 -1
- transformers/models/deit/configuration_deit.py +1 -0
- transformers/models/deit/image_processing_deit.py +21 -18
- transformers/models/deit/image_processing_deit_fast.py +1 -0
- transformers/models/deit/modeling_deit.py +22 -24
- transformers/models/depth_anything/configuration_depth_anything.py +4 -2
- transformers/models/depth_anything/modeling_depth_anything.py +10 -10
- transformers/models/depth_pro/configuration_depth_pro.py +1 -0
- transformers/models/depth_pro/image_processing_depth_pro.py +23 -22
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +10 -8
- transformers/models/depth_pro/modeling_depth_pro.py +27 -31
- transformers/models/detr/configuration_detr.py +2 -1
- transformers/models/detr/image_processing_detr.py +66 -64
- transformers/models/detr/image_processing_detr_fast.py +34 -33
- transformers/models/detr/modeling_detr.py +79 -95
- transformers/models/dia/configuration_dia.py +15 -9
- transformers/models/dia/feature_extraction_dia.py +9 -6
- transformers/models/dia/generation_dia.py +50 -48
- transformers/models/dia/modeling_dia.py +69 -78
- transformers/models/dia/modular_dia.py +56 -64
- transformers/models/dia/processing_dia.py +29 -39
- transformers/models/dia/tokenization_dia.py +6 -3
- transformers/models/diffllama/configuration_diffllama.py +30 -25
- transformers/models/diffllama/modeling_diffllama.py +49 -46
- transformers/models/diffllama/modular_diffllama.py +19 -17
- transformers/models/dinat/configuration_dinat.py +1 -0
- transformers/models/dinat/modeling_dinat.py +44 -47
- transformers/models/dinov2/configuration_dinov2.py +1 -0
- transformers/models/dinov2/modeling_dinov2.py +15 -15
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +1 -1
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +15 -16
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +9 -9
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +7 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +6 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +8 -5
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +9 -7
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +18 -19
- transformers/models/dinov3_vit/modular_dinov3_vit.py +15 -16
- transformers/models/distilbert/configuration_distilbert.py +2 -8
- transformers/models/distilbert/modeling_distilbert.py +55 -55
- transformers/models/distilbert/tokenization_distilbert.py +1 -13
- transformers/models/doge/__init__.py +1 -0
- transformers/models/doge/configuration_doge.py +32 -39
- transformers/models/doge/modeling_doge.py +49 -45
- transformers/models/doge/modular_doge.py +63 -71
- transformers/models/donut/configuration_donut_swin.py +1 -0
- transformers/models/donut/image_processing_donut.py +29 -26
- transformers/models/donut/image_processing_donut_fast.py +15 -9
- transformers/models/donut/modeling_donut_swin.py +58 -62
- transformers/models/donut/processing_donut.py +26 -5
- transformers/models/dots1/configuration_dots1.py +33 -41
- transformers/models/dots1/modeling_dots1.py +45 -54
- transformers/models/dots1/modular_dots1.py +4 -5
- transformers/models/dpr/configuration_dpr.py +2 -19
- transformers/models/dpr/modeling_dpr.py +39 -42
- transformers/models/dpr/tokenization_dpr.py +9 -19
- transformers/models/dpr/tokenization_dpr_fast.py +9 -7
- transformers/models/dpt/configuration_dpt.py +2 -1
- transformers/models/dpt/image_processing_dpt.py +66 -65
- transformers/models/dpt/image_processing_dpt_fast.py +20 -18
- transformers/models/dpt/modeling_dpt.py +30 -32
- transformers/models/dpt/modular_dpt.py +17 -15
- transformers/models/edgetam/configuration_edgetam.py +3 -2
- transformers/models/edgetam/modeling_edgetam.py +86 -86
- transformers/models/edgetam/modular_edgetam.py +26 -21
- transformers/models/edgetam_video/__init__.py +1 -0
- transformers/models/edgetam_video/configuration_edgetam_video.py +1 -0
- transformers/models/edgetam_video/modeling_edgetam_video.py +158 -169
- transformers/models/edgetam_video/modular_edgetam_video.py +37 -30
- transformers/models/efficientloftr/configuration_efficientloftr.py +5 -4
- transformers/models/efficientloftr/image_processing_efficientloftr.py +16 -14
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +9 -9
- transformers/models/efficientloftr/modeling_efficientloftr.py +38 -59
- transformers/models/efficientloftr/modular_efficientloftr.py +3 -1
- transformers/models/efficientnet/configuration_efficientnet.py +1 -0
- transformers/models/efficientnet/image_processing_efficientnet.py +32 -28
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +19 -17
- transformers/models/efficientnet/modeling_efficientnet.py +15 -19
- transformers/models/electra/configuration_electra.py +3 -13
- transformers/models/electra/modeling_electra.py +103 -108
- transformers/models/emu3/configuration_emu3.py +17 -13
- transformers/models/emu3/image_processing_emu3.py +39 -44
- transformers/models/emu3/modeling_emu3.py +108 -148
- transformers/models/emu3/modular_emu3.py +73 -115
- transformers/models/emu3/processing_emu3.py +43 -18
- transformers/models/encodec/configuration_encodec.py +4 -2
- transformers/models/encodec/feature_extraction_encodec.py +13 -10
- transformers/models/encodec/modeling_encodec.py +29 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +2 -12
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +43 -37
- transformers/models/eomt/configuration_eomt.py +1 -0
- transformers/models/eomt/image_processing_eomt.py +56 -66
- transformers/models/eomt/image_processing_eomt_fast.py +33 -76
- transformers/models/eomt/modeling_eomt.py +18 -23
- transformers/models/eomt/modular_eomt.py +13 -18
- transformers/models/ernie/configuration_ernie.py +3 -24
- transformers/models/ernie/modeling_ernie.py +132 -127
- transformers/models/ernie/modular_ernie.py +103 -97
- transformers/models/ernie4_5/configuration_ernie4_5.py +27 -23
- transformers/models/ernie4_5/modeling_ernie4_5.py +38 -36
- transformers/models/ernie4_5/modular_ernie4_5.py +4 -3
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +36 -32
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +55 -56
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +46 -18
- transformers/models/esm/configuration_esm.py +15 -11
- transformers/models/esm/modeling_esm.py +34 -38
- transformers/models/esm/modeling_esmfold.py +49 -53
- transformers/models/esm/openfold_utils/chunk_utils.py +6 -6
- transformers/models/esm/openfold_utils/loss.py +2 -1
- transformers/models/esm/openfold_utils/protein.py +16 -15
- transformers/models/esm/openfold_utils/tensor_utils.py +6 -6
- transformers/models/esm/tokenization_esm.py +4 -2
- transformers/models/evolla/configuration_evolla.py +40 -50
- transformers/models/evolla/modeling_evolla.py +66 -71
- transformers/models/evolla/modular_evolla.py +47 -53
- transformers/models/evolla/processing_evolla.py +35 -23
- transformers/models/exaone4/configuration_exaone4.py +25 -23
- transformers/models/exaone4/modeling_exaone4.py +38 -35
- transformers/models/exaone4/modular_exaone4.py +46 -44
- transformers/models/falcon/configuration_falcon.py +26 -31
- transformers/models/falcon/modeling_falcon.py +80 -82
- transformers/models/falcon_h1/configuration_falcon_h1.py +51 -45
- transformers/models/falcon_h1/modeling_falcon_h1.py +82 -85
- transformers/models/falcon_h1/modular_falcon_h1.py +51 -56
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +2 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +82 -75
- transformers/models/falcon_mamba/modular_falcon_mamba.py +45 -28
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +6 -2
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +60 -76
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +3 -2
- transformers/models/flaubert/configuration_flaubert.py +5 -10
- transformers/models/flaubert/modeling_flaubert.py +143 -145
- transformers/models/flaubert/tokenization_flaubert.py +5 -3
- transformers/models/flava/configuration_flava.py +6 -5
- transformers/models/flava/image_processing_flava.py +67 -66
- transformers/models/flava/image_processing_flava_fast.py +49 -46
- transformers/models/flava/modeling_flava.py +136 -153
- transformers/models/flava/processing_flava.py +12 -2
- transformers/models/flex_olmo/__init__.py +1 -0
- transformers/models/flex_olmo/configuration_flex_olmo.py +32 -28
- transformers/models/flex_olmo/modeling_flex_olmo.py +47 -47
- transformers/models/flex_olmo/modular_flex_olmo.py +44 -40
- transformers/models/florence2/configuration_florence2.py +1 -0
- transformers/models/florence2/modeling_florence2.py +69 -111
- transformers/models/florence2/modular_florence2.py +101 -104
- transformers/models/florence2/processing_florence2.py +47 -18
- transformers/models/fnet/configuration_fnet.py +2 -6
- transformers/models/fnet/modeling_fnet.py +80 -83
- transformers/models/fnet/tokenization_fnet.py +1 -0
- transformers/models/focalnet/configuration_focalnet.py +1 -0
- transformers/models/focalnet/modeling_focalnet.py +45 -51
- transformers/models/fsmt/configuration_fsmt.py +17 -12
- transformers/models/fsmt/modeling_fsmt.py +48 -49
- transformers/models/fsmt/tokenization_fsmt.py +5 -3
- transformers/models/funnel/configuration_funnel.py +1 -8
- transformers/models/funnel/modeling_funnel.py +93 -99
- transformers/models/funnel/tokenization_funnel.py +27 -17
- transformers/models/fuyu/configuration_fuyu.py +34 -28
- transformers/models/fuyu/image_processing_fuyu.py +31 -29
- transformers/models/fuyu/image_processing_fuyu_fast.py +17 -17
- transformers/models/fuyu/modeling_fuyu.py +53 -53
- transformers/models/fuyu/processing_fuyu.py +34 -23
- transformers/models/gemma/configuration_gemma.py +30 -25
- transformers/models/gemma/modeling_gemma.py +50 -46
- transformers/models/gemma/modular_gemma.py +47 -42
- transformers/models/gemma/tokenization_gemma.py +30 -10
- transformers/models/gemma2/configuration_gemma2.py +35 -30
- transformers/models/gemma2/modeling_gemma2.py +42 -39
- transformers/models/gemma2/modular_gemma2.py +66 -63
- transformers/models/gemma3/configuration_gemma3.py +44 -44
- transformers/models/gemma3/image_processing_gemma3.py +31 -29
- transformers/models/gemma3/image_processing_gemma3_fast.py +13 -11
- transformers/models/gemma3/modeling_gemma3.py +207 -159
- transformers/models/gemma3/modular_gemma3.py +204 -153
- transformers/models/gemma3/processing_gemma3.py +5 -5
- transformers/models/gemma3n/configuration_gemma3n.py +26 -36
- transformers/models/gemma3n/feature_extraction_gemma3n.py +11 -9
- transformers/models/gemma3n/modeling_gemma3n.py +356 -222
- transformers/models/gemma3n/modular_gemma3n.py +207 -230
- transformers/models/gemma3n/processing_gemma3n.py +26 -12
- transformers/models/git/configuration_git.py +8 -5
- transformers/models/git/modeling_git.py +204 -266
- transformers/models/git/processing_git.py +14 -2
- transformers/models/glm/configuration_glm.py +28 -24
- transformers/models/glm/modeling_glm.py +40 -37
- transformers/models/glm/modular_glm.py +7 -4
- transformers/models/glm4/configuration_glm4.py +28 -24
- transformers/models/glm4/modeling_glm4.py +42 -40
- transformers/models/glm4/modular_glm4.py +10 -8
- transformers/models/glm46v/configuration_glm46v.py +1 -0
- transformers/models/glm46v/image_processing_glm46v.py +40 -35
- transformers/models/glm46v/image_processing_glm46v_fast.py +9 -9
- transformers/models/glm46v/modeling_glm46v.py +90 -137
- transformers/models/glm46v/modular_glm46v.py +3 -4
- transformers/models/glm46v/processing_glm46v.py +41 -7
- transformers/models/glm46v/video_processing_glm46v.py +11 -9
- transformers/models/glm4_moe/configuration_glm4_moe.py +32 -40
- transformers/models/glm4_moe/modeling_glm4_moe.py +42 -45
- transformers/models/glm4_moe/modular_glm4_moe.py +34 -42
- transformers/models/glm4v/configuration_glm4v.py +20 -18
- transformers/models/glm4v/image_processing_glm4v.py +40 -34
- transformers/models/glm4v/image_processing_glm4v_fast.py +9 -8
- transformers/models/glm4v/modeling_glm4v.py +205 -254
- transformers/models/glm4v/modular_glm4v.py +224 -210
- transformers/models/glm4v/processing_glm4v.py +41 -7
- transformers/models/glm4v/video_processing_glm4v.py +11 -9
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +125 -136
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +368 -377
- transformers/models/glm4v_moe/modular_glm4v_moe.py +169 -83
- transformers/models/glpn/configuration_glpn.py +1 -0
- transformers/models/glpn/image_processing_glpn.py +12 -11
- transformers/models/glpn/image_processing_glpn_fast.py +13 -11
- transformers/models/glpn/modeling_glpn.py +14 -16
- transformers/models/got_ocr2/configuration_got_ocr2.py +12 -4
- transformers/models/got_ocr2/image_processing_got_ocr2.py +24 -22
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +11 -9
- transformers/models/got_ocr2/modeling_got_ocr2.py +80 -77
- transformers/models/got_ocr2/modular_got_ocr2.py +51 -54
- transformers/models/got_ocr2/processing_got_ocr2.py +63 -42
- transformers/models/gpt2/configuration_gpt2.py +2 -13
- transformers/models/gpt2/modeling_gpt2.py +115 -120
- transformers/models/gpt2/tokenization_gpt2.py +46 -15
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +2 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +89 -79
- transformers/models/gpt_neo/configuration_gpt_neo.py +2 -9
- transformers/models/gpt_neo/modeling_gpt_neo.py +67 -83
- transformers/models/gpt_neox/configuration_gpt_neox.py +25 -25
- transformers/models/gpt_neox/modeling_gpt_neox.py +75 -76
- transformers/models/gpt_neox/modular_gpt_neox.py +66 -67
- transformers/models/gpt_neox/tokenization_gpt_neox.py +51 -9
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +19 -24
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +47 -46
- transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +3 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +28 -46
- transformers/models/gpt_oss/modeling_gpt_oss.py +121 -83
- transformers/models/gpt_oss/modular_gpt_oss.py +103 -64
- transformers/models/gpt_sw3/tokenization_gpt_sw3.py +4 -4
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +87 -101
- transformers/models/granite/configuration_granite.py +33 -28
- transformers/models/granite/modeling_granite.py +46 -44
- transformers/models/granite/modular_granite.py +31 -29
- transformers/models/granite_speech/configuration_granite_speech.py +1 -0
- transformers/models/granite_speech/feature_extraction_granite_speech.py +3 -1
- transformers/models/granite_speech/modeling_granite_speech.py +52 -82
- transformers/models/granite_speech/processing_granite_speech.py +4 -11
- transformers/models/granitemoe/configuration_granitemoe.py +36 -31
- transformers/models/granitemoe/modeling_granitemoe.py +46 -41
- transformers/models/granitemoe/modular_granitemoe.py +27 -22
- transformers/models/granitemoehybrid/__init__.py +1 -0
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +47 -46
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +93 -97
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +21 -54
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +37 -33
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +61 -54
- transformers/models/granitemoeshared/modular_granitemoeshared.py +21 -19
- transformers/models/grounding_dino/configuration_grounding_dino.py +4 -6
- transformers/models/grounding_dino/image_processing_grounding_dino.py +62 -60
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +29 -28
- transformers/models/grounding_dino/modeling_grounding_dino.py +140 -155
- transformers/models/grounding_dino/modular_grounding_dino.py +3 -2
- transformers/models/grounding_dino/processing_grounding_dino.py +38 -10
- transformers/models/groupvit/configuration_groupvit.py +2 -4
- transformers/models/groupvit/modeling_groupvit.py +93 -107
- transformers/models/helium/configuration_helium.py +29 -25
- transformers/models/helium/modeling_helium.py +40 -38
- transformers/models/helium/modular_helium.py +7 -3
- transformers/models/herbert/tokenization_herbert.py +28 -10
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +1 -0
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -24
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -24
- transformers/models/hiera/configuration_hiera.py +1 -0
- transformers/models/hiera/modeling_hiera.py +66 -72
- transformers/models/hubert/configuration_hubert.py +2 -4
- transformers/models/hubert/modeling_hubert.py +37 -42
- transformers/models/hubert/modular_hubert.py +11 -13
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +31 -26
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +38 -35
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +6 -4
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +36 -31
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +42 -47
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +9 -9
- transformers/models/ibert/configuration_ibert.py +2 -4
- transformers/models/ibert/modeling_ibert.py +62 -82
- transformers/models/ibert/quant_modules.py +1 -0
- transformers/models/idefics/configuration_idefics.py +8 -5
- transformers/models/idefics/image_processing_idefics.py +15 -13
- transformers/models/idefics/modeling_idefics.py +82 -75
- transformers/models/idefics/perceiver.py +3 -1
- transformers/models/idefics/processing_idefics.py +48 -32
- transformers/models/idefics/vision.py +25 -24
- transformers/models/idefics2/configuration_idefics2.py +3 -1
- transformers/models/idefics2/image_processing_idefics2.py +32 -31
- transformers/models/idefics2/image_processing_idefics2_fast.py +8 -8
- transformers/models/idefics2/modeling_idefics2.py +101 -127
- transformers/models/idefics2/processing_idefics2.py +68 -10
- transformers/models/idefics3/configuration_idefics3.py +4 -1
- transformers/models/idefics3/image_processing_idefics3.py +43 -42
- transformers/models/idefics3/image_processing_idefics3_fast.py +15 -40
- transformers/models/idefics3/modeling_idefics3.py +90 -115
- transformers/models/idefics3/processing_idefics3.py +69 -15
- transformers/models/ijepa/configuration_ijepa.py +1 -0
- transformers/models/ijepa/modeling_ijepa.py +11 -10
- transformers/models/ijepa/modular_ijepa.py +7 -5
- transformers/models/imagegpt/configuration_imagegpt.py +2 -9
- transformers/models/imagegpt/image_processing_imagegpt.py +18 -17
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +16 -11
- transformers/models/imagegpt/modeling_imagegpt.py +65 -76
- transformers/models/informer/configuration_informer.py +9 -6
- transformers/models/informer/modeling_informer.py +86 -88
- transformers/models/informer/modular_informer.py +16 -14
- transformers/models/instructblip/configuration_instructblip.py +2 -2
- transformers/models/instructblip/modeling_instructblip.py +63 -103
- transformers/models/instructblip/processing_instructblip.py +36 -10
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -2
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +139 -157
- transformers/models/instructblipvideo/modular_instructblipvideo.py +64 -73
- transformers/models/instructblipvideo/processing_instructblipvideo.py +33 -14
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +8 -6
- transformers/models/internvl/configuration_internvl.py +1 -0
- transformers/models/internvl/modeling_internvl.py +106 -85
- transformers/models/internvl/modular_internvl.py +67 -47
- transformers/models/internvl/processing_internvl.py +45 -12
- transformers/models/internvl/video_processing_internvl.py +12 -10
- transformers/models/jamba/configuration_jamba.py +8 -5
- transformers/models/jamba/modeling_jamba.py +66 -68
- transformers/models/jamba/modular_jamba.py +55 -54
- transformers/models/janus/configuration_janus.py +1 -0
- transformers/models/janus/image_processing_janus.py +37 -35
- transformers/models/janus/image_processing_janus_fast.py +20 -18
- transformers/models/janus/modeling_janus.py +191 -115
- transformers/models/janus/modular_janus.py +84 -133
- transformers/models/janus/processing_janus.py +43 -17
- transformers/models/jetmoe/configuration_jetmoe.py +26 -24
- transformers/models/jetmoe/modeling_jetmoe.py +46 -43
- transformers/models/jetmoe/modular_jetmoe.py +33 -31
- transformers/models/kosmos2/configuration_kosmos2.py +9 -10
- transformers/models/kosmos2/modeling_kosmos2.py +173 -208
- transformers/models/kosmos2/processing_kosmos2.py +55 -40
- transformers/models/kosmos2_5/__init__.py +1 -0
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +9 -8
- transformers/models/kosmos2_5/image_processing_kosmos2_5.py +12 -10
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +13 -4
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +118 -132
- transformers/models/kosmos2_5/processing_kosmos2_5.py +29 -8
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +28 -31
- transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +14 -12
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +100 -110
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +22 -28
- transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py +8 -2
- transformers/models/layoutlm/configuration_layoutlm.py +2 -14
- transformers/models/layoutlm/modeling_layoutlm.py +72 -77
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +17 -14
- transformers/models/layoutlmv2/image_processing_layoutlmv2.py +21 -18
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +9 -7
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +50 -64
- transformers/models/layoutlmv2/processing_layoutlmv2.py +44 -14
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +126 -73
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +19 -16
- transformers/models/layoutlmv3/image_processing_layoutlmv3.py +26 -24
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +11 -9
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +56 -82
- transformers/models/layoutlmv3/processing_layoutlmv3.py +46 -14
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +134 -74
- transformers/models/layoutxlm/configuration_layoutxlm.py +17 -14
- transformers/models/layoutxlm/modular_layoutxlm.py +1 -0
- transformers/models/layoutxlm/processing_layoutxlm.py +44 -14
- transformers/models/layoutxlm/tokenization_layoutxlm.py +113 -77
- transformers/models/led/configuration_led.py +12 -8
- transformers/models/led/modeling_led.py +266 -124
- transformers/models/levit/configuration_levit.py +1 -0
- transformers/models/levit/image_processing_levit.py +21 -19
- transformers/models/levit/image_processing_levit_fast.py +5 -4
- transformers/models/levit/modeling_levit.py +19 -38
- transformers/models/lfm2/configuration_lfm2.py +30 -27
- transformers/models/lfm2/modeling_lfm2.py +50 -47
- transformers/models/lfm2/modular_lfm2.py +30 -29
- transformers/models/lfm2_moe/__init__.py +1 -0
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +9 -6
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +53 -61
- transformers/models/lfm2_moe/modular_lfm2_moe.py +37 -13
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +1 -4
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +12 -41
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +66 -84
- transformers/models/lfm2_vl/modular_lfm2_vl.py +56 -70
- transformers/models/lfm2_vl/processing_lfm2_vl.py +76 -96
- transformers/models/lightglue/image_processing_lightglue.py +15 -16
- transformers/models/lightglue/image_processing_lightglue_fast.py +9 -9
- transformers/models/lightglue/modeling_lightglue.py +31 -31
- transformers/models/lightglue/modular_lightglue.py +28 -29
- transformers/models/lilt/configuration_lilt.py +2 -6
- transformers/models/lilt/modeling_lilt.py +70 -76
- transformers/models/llama/configuration_llama.py +31 -26
- transformers/models/llama/modeling_llama.py +39 -36
- transformers/models/llama/tokenization_llama.py +44 -14
- transformers/models/llama4/configuration_llama4.py +30 -27
- transformers/models/llama4/image_processing_llama4_fast.py +14 -12
- transformers/models/llama4/modeling_llama4.py +113 -120
- transformers/models/llama4/processing_llama4.py +57 -33
- transformers/models/llava/configuration_llava.py +1 -10
- transformers/models/llava/image_processing_llava.py +28 -25
- transformers/models/llava/image_processing_llava_fast.py +11 -9
- transformers/models/llava/modeling_llava.py +109 -85
- transformers/models/llava/processing_llava.py +51 -18
- transformers/models/llava_next/configuration_llava_next.py +2 -2
- transformers/models/llava_next/image_processing_llava_next.py +45 -43
- transformers/models/llava_next/image_processing_llava_next_fast.py +13 -11
- transformers/models/llava_next/modeling_llava_next.py +107 -110
- transformers/models/llava_next/processing_llava_next.py +47 -18
- transformers/models/llava_next_video/configuration_llava_next_video.py +7 -4
- transformers/models/llava_next_video/modeling_llava_next_video.py +158 -175
- transformers/models/llava_next_video/modular_llava_next_video.py +150 -155
- transformers/models/llava_next_video/processing_llava_next_video.py +63 -21
- transformers/models/llava_next_video/video_processing_llava_next_video.py +1 -0
- transformers/models/llava_onevision/configuration_llava_onevision.py +7 -4
- transformers/models/llava_onevision/image_processing_llava_onevision.py +42 -40
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +15 -14
- transformers/models/llava_onevision/modeling_llava_onevision.py +169 -177
- transformers/models/llava_onevision/modular_llava_onevision.py +156 -163
- transformers/models/llava_onevision/processing_llava_onevision.py +53 -21
- transformers/models/llava_onevision/video_processing_llava_onevision.py +1 -0
- transformers/models/longcat_flash/__init__.py +1 -0
- transformers/models/longcat_flash/configuration_longcat_flash.py +42 -37
- transformers/models/longcat_flash/modeling_longcat_flash.py +36 -36
- transformers/models/longcat_flash/modular_longcat_flash.py +21 -21
- transformers/models/longformer/configuration_longformer.py +5 -5
- transformers/models/longformer/modeling_longformer.py +101 -105
- transformers/models/longt5/configuration_longt5.py +7 -9
- transformers/models/longt5/modeling_longt5.py +49 -49
- transformers/models/luke/configuration_luke.py +2 -8
- transformers/models/luke/modeling_luke.py +181 -188
- transformers/models/luke/tokenization_luke.py +140 -107
- transformers/models/lxmert/configuration_lxmert.py +1 -16
- transformers/models/lxmert/modeling_lxmert.py +74 -65
- transformers/models/m2m_100/configuration_m2m_100.py +9 -7
- transformers/models/m2m_100/modeling_m2m_100.py +71 -83
- transformers/models/m2m_100/tokenization_m2m_100.py +8 -8
- transformers/models/mamba/configuration_mamba.py +2 -1
- transformers/models/mamba/modeling_mamba.py +66 -58
- transformers/models/mamba2/configuration_mamba2.py +8 -5
- transformers/models/mamba2/modeling_mamba2.py +69 -68
- transformers/models/marian/configuration_marian.py +5 -10
- transformers/models/marian/modeling_marian.py +87 -93
- transformers/models/marian/tokenization_marian.py +6 -6
- transformers/models/markuplm/configuration_markuplm.py +7 -4
- transformers/models/markuplm/feature_extraction_markuplm.py +2 -1
- transformers/models/markuplm/modeling_markuplm.py +70 -69
- transformers/models/markuplm/processing_markuplm.py +38 -31
- transformers/models/markuplm/tokenization_markuplm.py +136 -93
- transformers/models/mask2former/configuration_mask2former.py +8 -5
- transformers/models/mask2former/image_processing_mask2former.py +85 -84
- transformers/models/mask2former/image_processing_mask2former_fast.py +40 -37
- transformers/models/mask2former/modeling_mask2former.py +103 -118
- transformers/models/mask2former/modular_mask2former.py +8 -6
- transformers/models/maskformer/configuration_maskformer.py +9 -6
- transformers/models/maskformer/configuration_maskformer_swin.py +1 -0
- transformers/models/maskformer/image_processing_maskformer.py +85 -84
- transformers/models/maskformer/image_processing_maskformer_fast.py +40 -36
- transformers/models/maskformer/modeling_maskformer.py +65 -79
- transformers/models/maskformer/modeling_maskformer_swin.py +32 -36
- transformers/models/mbart/configuration_mbart.py +4 -9
- transformers/models/mbart/modeling_mbart.py +116 -131
- transformers/models/mbart/tokenization_mbart.py +54 -11
- transformers/models/mbart50/tokenization_mbart50.py +13 -8
- transformers/models/megatron_bert/configuration_megatron_bert.py +3 -13
- transformers/models/megatron_bert/modeling_megatron_bert.py +150 -148
- transformers/models/metaclip_2/configuration_metaclip_2.py +1 -4
- transformers/models/metaclip_2/modeling_metaclip_2.py +84 -91
- transformers/models/metaclip_2/modular_metaclip_2.py +45 -61
- transformers/models/mgp_str/configuration_mgp_str.py +1 -0
- transformers/models/mgp_str/modeling_mgp_str.py +18 -20
- transformers/models/mgp_str/processing_mgp_str.py +20 -3
- transformers/models/mgp_str/tokenization_mgp_str.py +3 -1
- transformers/models/mimi/configuration_mimi.py +40 -42
- transformers/models/mimi/modeling_mimi.py +113 -142
- transformers/models/minimax/__init__.py +1 -0
- transformers/models/minimax/configuration_minimax.py +43 -37
- transformers/models/minimax/modeling_minimax.py +51 -61
- transformers/models/minimax/modular_minimax.py +62 -68
- transformers/models/ministral/configuration_ministral.py +29 -25
- transformers/models/ministral/modeling_ministral.py +38 -36
- transformers/models/ministral/modular_ministral.py +37 -32
- transformers/models/ministral3/configuration_ministral3.py +27 -24
- transformers/models/ministral3/modeling_ministral3.py +37 -36
- transformers/models/ministral3/modular_ministral3.py +5 -4
- transformers/models/mistral/configuration_mistral.py +29 -24
- transformers/models/mistral/modeling_mistral.py +37 -36
- transformers/models/mistral/modular_mistral.py +12 -11
- transformers/models/mistral3/configuration_mistral3.py +1 -4
- transformers/models/mistral3/modeling_mistral3.py +86 -89
- transformers/models/mistral3/modular_mistral3.py +68 -69
- transformers/models/mixtral/configuration_mixtral.py +34 -29
- transformers/models/mixtral/modeling_mixtral.py +45 -50
- transformers/models/mixtral/modular_mixtral.py +31 -32
- transformers/models/mlcd/configuration_mlcd.py +1 -0
- transformers/models/mlcd/modeling_mlcd.py +14 -20
- transformers/models/mlcd/modular_mlcd.py +13 -17
- transformers/models/mllama/configuration_mllama.py +15 -10
- transformers/models/mllama/image_processing_mllama.py +25 -23
- transformers/models/mllama/image_processing_mllama_fast.py +11 -11
- transformers/models/mllama/modeling_mllama.py +94 -105
- transformers/models/mllama/processing_mllama.py +55 -6
- transformers/models/mluke/tokenization_mluke.py +107 -101
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +3 -5
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +140 -155
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +3 -5
- transformers/models/mobilebert/configuration_mobilebert.py +2 -4
- transformers/models/mobilebert/modeling_mobilebert.py +85 -77
- transformers/models/mobilebert/tokenization_mobilebert.py +1 -0
- transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +1 -0
- transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +23 -20
- transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +1 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +16 -15
- transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +1 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +51 -48
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +15 -13
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +22 -24
- transformers/models/mobilevit/configuration_mobilevit.py +1 -0
- transformers/models/mobilevit/image_processing_mobilevit.py +49 -46
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +14 -12
- transformers/models/mobilevit/modeling_mobilevit.py +21 -28
- transformers/models/mobilevitv2/configuration_mobilevitv2.py +1 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +22 -28
- transformers/models/modernbert/configuration_modernbert.py +42 -44
- transformers/models/modernbert/modeling_modernbert.py +133 -145
- transformers/models/modernbert/modular_modernbert.py +170 -186
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +40 -40
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +57 -62
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +86 -94
- transformers/models/moonshine/configuration_moonshine.py +31 -34
- transformers/models/moonshine/modeling_moonshine.py +71 -71
- transformers/models/moonshine/modular_moonshine.py +83 -88
- transformers/models/moshi/configuration_moshi.py +23 -46
- transformers/models/moshi/modeling_moshi.py +187 -157
- transformers/models/mpnet/configuration_mpnet.py +2 -6
- transformers/models/mpnet/modeling_mpnet.py +57 -62
- transformers/models/mpnet/tokenization_mpnet.py +15 -4
- transformers/models/mpt/configuration_mpt.py +9 -5
- transformers/models/mpt/modeling_mpt.py +60 -60
- transformers/models/mra/configuration_mra.py +2 -8
- transformers/models/mra/modeling_mra.py +57 -64
- transformers/models/mt5/configuration_mt5.py +8 -10
- transformers/models/mt5/modeling_mt5.py +95 -87
- transformers/models/musicgen/configuration_musicgen.py +8 -12
- transformers/models/musicgen/modeling_musicgen.py +122 -118
- transformers/models/musicgen/processing_musicgen.py +21 -3
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +8 -15
- transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +9 -8
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +123 -117
- transformers/models/musicgen_melody/processing_musicgen_melody.py +22 -3
- transformers/models/mvp/configuration_mvp.py +5 -8
- transformers/models/mvp/modeling_mvp.py +123 -135
- transformers/models/myt5/tokenization_myt5.py +10 -8
- transformers/models/nanochat/configuration_nanochat.py +8 -5
- transformers/models/nanochat/modeling_nanochat.py +40 -37
- transformers/models/nanochat/modular_nanochat.py +14 -12
- transformers/models/nemotron/configuration_nemotron.py +30 -25
- transformers/models/nemotron/modeling_nemotron.py +57 -56
- transformers/models/nllb/tokenization_nllb.py +28 -12
- transformers/models/nllb_moe/configuration_nllb_moe.py +9 -7
- transformers/models/nllb_moe/modeling_nllb_moe.py +69 -77
- transformers/models/nougat/image_processing_nougat.py +32 -29
- transformers/models/nougat/image_processing_nougat_fast.py +14 -12
- transformers/models/nougat/processing_nougat.py +39 -37
- transformers/models/nougat/tokenization_nougat.py +73 -18
- transformers/models/nystromformer/configuration_nystromformer.py +2 -8
- transformers/models/nystromformer/modeling_nystromformer.py +63 -74
- transformers/models/olmo/configuration_olmo.py +28 -23
- transformers/models/olmo/modeling_olmo.py +39 -36
- transformers/models/olmo/modular_olmo.py +11 -7
- transformers/models/olmo2/configuration_olmo2.py +28 -23
- transformers/models/olmo2/modeling_olmo2.py +41 -37
- transformers/models/olmo2/modular_olmo2.py +32 -29
- transformers/models/olmo3/__init__.py +1 -0
- transformers/models/olmo3/configuration_olmo3.py +30 -26
- transformers/models/olmo3/modeling_olmo3.py +39 -36
- transformers/models/olmo3/modular_olmo3.py +40 -37
- transformers/models/olmoe/configuration_olmoe.py +33 -29
- transformers/models/olmoe/modeling_olmoe.py +46 -52
- transformers/models/olmoe/modular_olmoe.py +15 -16
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +4 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +47 -53
- transformers/models/omdet_turbo/processing_omdet_turbo.py +67 -19
- transformers/models/oneformer/configuration_oneformer.py +8 -5
- transformers/models/oneformer/image_processing_oneformer.py +84 -83
- transformers/models/oneformer/image_processing_oneformer_fast.py +42 -41
- transformers/models/oneformer/modeling_oneformer.py +171 -147
- transformers/models/oneformer/processing_oneformer.py +43 -28
- transformers/models/openai/configuration_openai.py +1 -16
- transformers/models/openai/modeling_openai.py +51 -65
- transformers/models/openai/tokenization_openai.py +47 -8
- transformers/models/opt/configuration_opt.py +7 -6
- transformers/models/opt/modeling_opt.py +76 -78
- transformers/models/ovis2/__init__.py +1 -0
- transformers/models/ovis2/configuration_ovis2.py +1 -0
- transformers/models/ovis2/image_processing_ovis2.py +24 -22
- transformers/models/ovis2/image_processing_ovis2_fast.py +11 -9
- transformers/models/ovis2/modeling_ovis2.py +142 -111
- transformers/models/ovis2/modular_ovis2.py +45 -90
- transformers/models/ovis2/processing_ovis2.py +40 -12
- transformers/models/owlv2/configuration_owlv2.py +2 -4
- transformers/models/owlv2/image_processing_owlv2.py +21 -20
- transformers/models/owlv2/image_processing_owlv2_fast.py +15 -12
- transformers/models/owlv2/modeling_owlv2.py +117 -133
- transformers/models/owlv2/modular_owlv2.py +14 -11
- transformers/models/owlv2/processing_owlv2.py +49 -20
- transformers/models/owlvit/configuration_owlvit.py +2 -4
- transformers/models/owlvit/image_processing_owlvit.py +22 -21
- transformers/models/owlvit/image_processing_owlvit_fast.py +3 -2
- transformers/models/owlvit/modeling_owlvit.py +116 -132
- transformers/models/owlvit/processing_owlvit.py +48 -20
- transformers/models/paligemma/configuration_paligemma.py +1 -4
- transformers/models/paligemma/modeling_paligemma.py +93 -103
- transformers/models/paligemma/processing_paligemma.py +66 -13
- transformers/models/parakeet/configuration_parakeet.py +14 -7
- transformers/models/parakeet/feature_extraction_parakeet.py +12 -10
- transformers/models/parakeet/modeling_parakeet.py +28 -32
- transformers/models/parakeet/modular_parakeet.py +20 -23
- transformers/models/parakeet/processing_parakeet.py +5 -13
- transformers/models/parakeet/{tokenization_parakeet.py → tokenization_parakeet_fast.py} +7 -5
- transformers/models/patchtsmixer/configuration_patchtsmixer.py +8 -5
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +62 -70
- transformers/models/patchtst/configuration_patchtst.py +9 -6
- transformers/models/patchtst/modeling_patchtst.py +80 -97
- transformers/models/pegasus/configuration_pegasus.py +5 -8
- transformers/models/pegasus/modeling_pegasus.py +66 -72
- transformers/models/pegasus/tokenization_pegasus.py +45 -15
- transformers/models/pegasus_x/configuration_pegasus_x.py +4 -5
- transformers/models/pegasus_x/modeling_pegasus_x.py +52 -55
- transformers/models/perceiver/configuration_perceiver.py +1 -0
- transformers/models/perceiver/image_processing_perceiver.py +25 -22
- transformers/models/perceiver/image_processing_perceiver_fast.py +9 -7
- transformers/models/perceiver/modeling_perceiver.py +146 -165
- transformers/models/perceiver/tokenization_perceiver.py +6 -3
- transformers/models/perception_lm/configuration_perception_lm.py +1 -0
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +10 -8
- transformers/models/perception_lm/modeling_perception_lm.py +70 -71
- transformers/models/perception_lm/modular_perception_lm.py +61 -65
- transformers/models/perception_lm/processing_perception_lm.py +47 -13
- transformers/models/perception_lm/video_processing_perception_lm.py +1 -0
- transformers/models/persimmon/configuration_persimmon.py +28 -23
- transformers/models/persimmon/modeling_persimmon.py +45 -43
- transformers/models/phi/configuration_phi.py +28 -23
- transformers/models/phi/modeling_phi.py +43 -40
- transformers/models/phi/modular_phi.py +24 -23
- transformers/models/phi3/configuration_phi3.py +33 -28
- transformers/models/phi3/modeling_phi3.py +38 -36
- transformers/models/phi3/modular_phi3.py +17 -13
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +33 -30
- transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +9 -7
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +11 -11
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +78 -95
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +80 -98
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +44 -7
- transformers/models/phimoe/configuration_phimoe.py +36 -31
- transformers/models/phimoe/modeling_phimoe.py +45 -50
- transformers/models/phimoe/modular_phimoe.py +4 -3
- transformers/models/phobert/tokenization_phobert.py +6 -4
- transformers/models/pix2struct/configuration_pix2struct.py +10 -12
- transformers/models/pix2struct/image_processing_pix2struct.py +19 -15
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +15 -12
- transformers/models/pix2struct/modeling_pix2struct.py +52 -58
- transformers/models/pix2struct/processing_pix2struct.py +30 -5
- transformers/models/pixtral/configuration_pixtral.py +14 -11
- transformers/models/pixtral/image_processing_pixtral.py +28 -26
- transformers/models/pixtral/image_processing_pixtral_fast.py +11 -10
- transformers/models/pixtral/modeling_pixtral.py +34 -28
- transformers/models/pixtral/processing_pixtral.py +53 -21
- transformers/models/plbart/configuration_plbart.py +5 -8
- transformers/models/plbart/modeling_plbart.py +106 -119
- transformers/models/plbart/modular_plbart.py +33 -39
- transformers/models/plbart/tokenization_plbart.py +7 -4
- transformers/models/poolformer/configuration_poolformer.py +1 -0
- transformers/models/poolformer/image_processing_poolformer.py +24 -21
- transformers/models/poolformer/image_processing_poolformer_fast.py +15 -13
- transformers/models/poolformer/modeling_poolformer.py +13 -23
- transformers/models/pop2piano/configuration_pop2piano.py +8 -7
- transformers/models/pop2piano/feature_extraction_pop2piano.py +9 -6
- transformers/models/pop2piano/modeling_pop2piano.py +24 -26
- transformers/models/pop2piano/processing_pop2piano.py +33 -25
- transformers/models/pop2piano/tokenization_pop2piano.py +23 -15
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +3 -3
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +28 -28
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +21 -20
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +13 -16
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +13 -16
- transformers/models/prophetnet/configuration_prophetnet.py +38 -37
- transformers/models/prophetnet/modeling_prophetnet.py +131 -114
- transformers/models/prophetnet/tokenization_prophetnet.py +16 -14
- transformers/models/pvt/configuration_pvt.py +1 -0
- transformers/models/pvt/image_processing_pvt.py +27 -24
- transformers/models/pvt/image_processing_pvt_fast.py +2 -1
- transformers/models/pvt/modeling_pvt.py +21 -21
- transformers/models/pvt_v2/configuration_pvt_v2.py +4 -2
- transformers/models/pvt_v2/modeling_pvt_v2.py +25 -28
- transformers/models/qwen2/configuration_qwen2.py +25 -32
- transformers/models/qwen2/modeling_qwen2.py +38 -36
- transformers/models/qwen2/modular_qwen2.py +12 -11
- transformers/models/qwen2/tokenization_qwen2.py +23 -12
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +26 -32
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +277 -340
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +211 -278
- transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +49 -41
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +35 -29
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +148 -203
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +118 -93
- transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +43 -7
- transformers/models/qwen2_audio/configuration_qwen2_audio.py +1 -0
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +40 -40
- transformers/models/qwen2_audio/processing_qwen2_audio.py +42 -13
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +35 -42
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +46 -51
- transformers/models/qwen2_moe/modular_qwen2_moe.py +10 -7
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +34 -29
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +42 -41
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +15 -12
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +153 -199
- transformers/models/qwen2_vl/processing_qwen2_vl.py +44 -7
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +18 -38
- transformers/models/qwen3/configuration_qwen3.py +27 -34
- transformers/models/qwen3/modeling_qwen3.py +39 -36
- transformers/models/qwen3/modular_qwen3.py +6 -4
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +32 -39
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +46 -51
- transformers/models/qwen3_moe/modular_qwen3_moe.py +13 -10
- transformers/models/qwen3_next/configuration_qwen3_next.py +35 -45
- transformers/models/qwen3_next/modeling_qwen3_next.py +51 -47
- transformers/models/qwen3_next/modular_qwen3_next.py +35 -34
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +101 -135
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +252 -355
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +196 -250
- transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +48 -40
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +29 -27
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +155 -233
- transformers/models/qwen3_vl/modular_qwen3_vl.py +179 -206
- transformers/models/qwen3_vl/processing_qwen3_vl.py +42 -6
- transformers/models/qwen3_vl/video_processing_qwen3_vl.py +12 -10
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +30 -23
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +303 -358
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +124 -87
- transformers/models/rag/configuration_rag.py +15 -6
- transformers/models/rag/modeling_rag.py +130 -127
- transformers/models/rag/retrieval_rag.py +5 -3
- transformers/models/rag/tokenization_rag.py +50 -0
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +30 -29
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +42 -53
- transformers/models/reformer/configuration_reformer.py +8 -7
- transformers/models/reformer/modeling_reformer.py +69 -80
- transformers/models/reformer/tokenization_reformer.py +31 -11
- transformers/models/regnet/configuration_regnet.py +1 -0
- transformers/models/regnet/modeling_regnet.py +8 -15
- transformers/models/rembert/configuration_rembert.py +2 -8
- transformers/models/rembert/modeling_rembert.py +111 -121
- transformers/models/rembert/tokenization_rembert.py +12 -2
- transformers/models/resnet/configuration_resnet.py +1 -0
- transformers/models/resnet/modeling_resnet.py +13 -27
- transformers/models/roberta/configuration_roberta.py +3 -11
- transformers/models/roberta/modeling_roberta.py +93 -94
- transformers/models/roberta/modular_roberta.py +58 -58
- transformers/models/roberta/tokenization_roberta.py +29 -17
- transformers/models/roberta/tokenization_roberta_old.py +4 -2
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +3 -11
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +93 -94
- transformers/models/roc_bert/configuration_roc_bert.py +2 -8
- transformers/models/roc_bert/modeling_roc_bert.py +121 -122
- transformers/models/roc_bert/tokenization_roc_bert.py +94 -88
- transformers/models/roformer/configuration_roformer.py +3 -13
- transformers/models/roformer/modeling_roformer.py +81 -85
- transformers/models/roformer/tokenization_roformer.py +412 -74
- transformers/models/roformer/tokenization_roformer_fast.py +160 -0
- transformers/models/roformer/tokenization_utils.py +1 -0
- transformers/models/rt_detr/configuration_rt_detr.py +2 -1
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +1 -0
- transformers/models/rt_detr/image_processing_rt_detr.py +55 -54
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +26 -26
- transformers/models/rt_detr/modeling_rt_detr.py +90 -99
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +6 -13
- transformers/models/rt_detr/modular_rt_detr.py +16 -16
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +4 -6
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +90 -101
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +12 -19
- transformers/models/rwkv/configuration_rwkv.py +4 -2
- transformers/models/rwkv/modeling_rwkv.py +32 -31
- transformers/models/sam/configuration_sam.py +1 -3
- transformers/models/sam/image_processing_sam.py +60 -59
- transformers/models/sam/image_processing_sam_fast.py +27 -25
- transformers/models/sam/modeling_sam.py +41 -47
- transformers/models/sam/processing_sam.py +27 -39
- transformers/models/sam2/configuration_sam2.py +3 -2
- transformers/models/sam2/image_processing_sam2_fast.py +15 -14
- transformers/models/sam2/modeling_sam2.py +90 -96
- transformers/models/sam2/modular_sam2.py +91 -86
- transformers/models/sam2/processing_sam2.py +47 -31
- transformers/models/sam2_video/configuration_sam2_video.py +1 -0
- transformers/models/sam2_video/modeling_sam2_video.py +144 -151
- transformers/models/sam2_video/modular_sam2_video.py +104 -101
- transformers/models/sam2_video/processing_sam2_video.py +66 -49
- transformers/models/sam2_video/video_processing_sam2_video.py +4 -1
- transformers/models/sam3/configuration_sam3.py +2 -21
- transformers/models/sam3/image_processing_sam3_fast.py +20 -17
- transformers/models/sam3/modeling_sam3.py +170 -184
- transformers/models/sam3/modular_sam3.py +8 -3
- transformers/models/sam3/processing_sam3.py +52 -37
- transformers/models/sam3_tracker/__init__.py +1 -0
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +3 -1
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +77 -82
- transformers/models/sam3_tracker/modular_sam3_tracker.py +3 -8
- transformers/models/sam3_tracker/processing_sam3_tracker.py +48 -31
- transformers/models/sam3_tracker_video/__init__.py +1 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +1 -25
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +122 -135
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +26 -35
- transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +66 -50
- transformers/models/sam3_video/configuration_sam3_video.py +1 -14
- transformers/models/sam3_video/modeling_sam3_video.py +34 -33
- transformers/models/sam3_video/processing_sam3_video.py +46 -26
- transformers/models/sam_hq/__init__.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -3
- transformers/models/sam_hq/modeling_sam_hq.py +69 -74
- transformers/models/sam_hq/modular_sam_hq.py +25 -23
- transformers/models/sam_hq/{processing_sam_hq.py → processing_samhq.py} +29 -41
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +10 -8
- transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py +11 -8
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +194 -212
- transformers/models/seamless_m4t/processing_seamless_m4t.py +39 -18
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +77 -40
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +10 -8
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +196 -204
- transformers/models/seed_oss/configuration_seed_oss.py +32 -28
- transformers/models/seed_oss/modeling_seed_oss.py +35 -33
- transformers/models/seed_oss/modular_seed_oss.py +4 -3
- transformers/models/segformer/configuration_segformer.py +10 -0
- transformers/models/segformer/image_processing_segformer.py +42 -39
- transformers/models/segformer/image_processing_segformer_fast.py +12 -10
- transformers/models/segformer/modeling_segformer.py +31 -34
- transformers/models/segformer/modular_segformer.py +10 -8
- transformers/models/seggpt/configuration_seggpt.py +1 -0
- transformers/models/seggpt/image_processing_seggpt.py +41 -38
- transformers/models/seggpt/modeling_seggpt.py +38 -50
- transformers/models/sew/configuration_sew.py +2 -4
- transformers/models/sew/modeling_sew.py +36 -38
- transformers/models/sew/modular_sew.py +13 -13
- transformers/models/sew_d/configuration_sew_d.py +2 -4
- transformers/models/sew_d/modeling_sew_d.py +30 -31
- transformers/models/shieldgemma2/configuration_shieldgemma2.py +1 -0
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +17 -16
- transformers/models/shieldgemma2/processing_shieldgemma2.py +5 -3
- transformers/models/siglip/configuration_siglip.py +2 -4
- transformers/models/siglip/image_processing_siglip.py +20 -17
- transformers/models/siglip/image_processing_siglip_fast.py +1 -0
- transformers/models/siglip/modeling_siglip.py +75 -84
- transformers/models/siglip/processing_siglip.py +14 -2
- transformers/models/siglip/tokenization_siglip.py +7 -6
- transformers/models/siglip2/configuration_siglip2.py +2 -5
- transformers/models/siglip2/image_processing_siglip2.py +16 -15
- transformers/models/siglip2/image_processing_siglip2_fast.py +7 -6
- transformers/models/siglip2/modeling_siglip2.py +129 -143
- transformers/models/siglip2/modular_siglip2.py +46 -47
- transformers/models/siglip2/processing_siglip2.py +14 -2
- transformers/models/smollm3/configuration_smollm3.py +32 -29
- transformers/models/smollm3/modeling_smollm3.py +39 -36
- transformers/models/smollm3/modular_smollm3.py +35 -33
- transformers/models/smolvlm/configuration_smolvlm.py +4 -2
- transformers/models/smolvlm/image_processing_smolvlm.py +43 -42
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +15 -41
- transformers/models/smolvlm/modeling_smolvlm.py +94 -126
- transformers/models/smolvlm/modular_smolvlm.py +39 -50
- transformers/models/smolvlm/processing_smolvlm.py +83 -15
- transformers/models/smolvlm/video_processing_smolvlm.py +18 -16
- transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +1 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +27 -26
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -9
- transformers/models/speech_to_text/feature_extraction_speech_to_text.py +13 -10
- transformers/models/speech_to_text/modeling_speech_to_text.py +54 -66
- transformers/models/speech_to_text/processing_speech_to_text.py +30 -4
- transformers/models/speech_to_text/tokenization_speech_to_text.py +6 -5
- transformers/models/speecht5/configuration_speecht5.py +9 -7
- transformers/models/speecht5/feature_extraction_speecht5.py +37 -16
- transformers/models/speecht5/modeling_speecht5.py +175 -213
- transformers/models/speecht5/number_normalizer.py +1 -0
- transformers/models/speecht5/processing_speecht5.py +37 -3
- transformers/models/speecht5/tokenization_speecht5.py +5 -4
- transformers/models/splinter/configuration_splinter.py +7 -6
- transformers/models/splinter/modeling_splinter.py +59 -71
- transformers/models/splinter/tokenization_splinter.py +30 -9
- transformers/models/squeezebert/configuration_squeezebert.py +2 -14
- transformers/models/squeezebert/modeling_squeezebert.py +62 -68
- transformers/models/squeezebert/tokenization_squeezebert.py +1 -0
- transformers/models/stablelm/configuration_stablelm.py +29 -24
- transformers/models/stablelm/modeling_stablelm.py +45 -44
- transformers/models/starcoder2/configuration_starcoder2.py +27 -30
- transformers/models/starcoder2/modeling_starcoder2.py +41 -39
- transformers/models/starcoder2/modular_starcoder2.py +16 -14
- transformers/models/superglue/configuration_superglue.py +3 -7
- transformers/models/superglue/image_processing_superglue.py +15 -15
- transformers/models/superglue/image_processing_superglue_fast.py +10 -9
- transformers/models/superglue/modeling_superglue.py +37 -42
- transformers/models/superpoint/image_processing_superpoint.py +15 -15
- transformers/models/superpoint/image_processing_superpoint_fast.py +11 -8
- transformers/models/superpoint/modeling_superpoint.py +16 -18
- transformers/models/swiftformer/configuration_swiftformer.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +14 -18
- transformers/models/swin/configuration_swin.py +1 -0
- transformers/models/swin/modeling_swin.py +86 -86
- transformers/models/swin2sr/configuration_swin2sr.py +1 -0
- transformers/models/swin2sr/image_processing_swin2sr.py +13 -10
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +8 -4
- transformers/models/swin2sr/modeling_swin2sr.py +63 -81
- transformers/models/swinv2/configuration_swinv2.py +1 -0
- transformers/models/swinv2/modeling_swinv2.py +104 -108
- transformers/models/switch_transformers/configuration_switch_transformers.py +7 -11
- transformers/models/switch_transformers/modeling_switch_transformers.py +44 -37
- transformers/models/switch_transformers/modular_switch_transformers.py +41 -34
- transformers/models/t5/configuration_t5.py +8 -14
- transformers/models/t5/modeling_t5.py +92 -88
- transformers/models/t5/tokenization_t5.py +9 -3
- transformers/models/t5gemma/configuration_t5gemma.py +41 -43
- transformers/models/t5gemma/modeling_t5gemma.py +107 -104
- transformers/models/t5gemma/modular_t5gemma.py +120 -124
- transformers/models/t5gemma2/configuration_t5gemma2.py +120 -80
- transformers/models/t5gemma2/modeling_t5gemma2.py +125 -141
- transformers/models/t5gemma2/modular_t5gemma2.py +104 -393
- transformers/models/table_transformer/configuration_table_transformer.py +2 -1
- transformers/models/table_transformer/modeling_table_transformer.py +49 -51
- transformers/models/tapas/configuration_tapas.py +2 -12
- transformers/models/tapas/modeling_tapas.py +67 -68
- transformers/models/tapas/tokenization_tapas.py +153 -115
- transformers/models/textnet/configuration_textnet.py +1 -0
- transformers/models/textnet/image_processing_textnet.py +25 -22
- transformers/models/textnet/image_processing_textnet_fast.py +10 -8
- transformers/models/textnet/modeling_textnet.py +16 -28
- transformers/models/time_series_transformer/configuration_time_series_transformer.py +8 -5
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +81 -83
- transformers/models/timesfm/configuration_timesfm.py +1 -0
- transformers/models/timesfm/modeling_timesfm.py +22 -33
- transformers/models/timesfm/modular_timesfm.py +21 -32
- transformers/models/timesformer/configuration_timesformer.py +1 -0
- transformers/models/timesformer/modeling_timesformer.py +16 -15
- transformers/models/timm_backbone/configuration_timm_backbone.py +1 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +15 -17
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -5
- transformers/models/timm_wrapper/image_processing_timm_wrapper.py +5 -4
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +29 -34
- transformers/models/trocr/configuration_trocr.py +8 -11
- transformers/models/trocr/modeling_trocr.py +44 -45
- transformers/models/trocr/processing_trocr.py +25 -5
- transformers/models/tvp/configuration_tvp.py +2 -5
- transformers/models/tvp/image_processing_tvp.py +52 -50
- transformers/models/tvp/image_processing_tvp_fast.py +15 -15
- transformers/models/tvp/modeling_tvp.py +27 -27
- transformers/models/tvp/processing_tvp.py +14 -2
- transformers/models/udop/configuration_udop.py +7 -16
- transformers/models/udop/modeling_udop.py +73 -71
- transformers/models/udop/processing_udop.py +26 -7
- transformers/models/udop/tokenization_udop.py +105 -84
- transformers/models/umt5/configuration_umt5.py +7 -8
- transformers/models/umt5/modeling_umt5.py +90 -94
- transformers/models/unispeech/configuration_unispeech.py +2 -4
- transformers/models/unispeech/modeling_unispeech.py +49 -51
- transformers/models/unispeech/modular_unispeech.py +22 -22
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +2 -4
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +65 -69
- transformers/models/unispeech_sat/modular_unispeech_sat.py +23 -23
- transformers/models/univnet/feature_extraction_univnet.py +14 -14
- transformers/models/univnet/modeling_univnet.py +8 -8
- transformers/models/upernet/configuration_upernet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +13 -11
- transformers/models/vaultgemma/__init__.py +1 -0
- transformers/models/vaultgemma/configuration_vaultgemma.py +33 -29
- transformers/models/vaultgemma/modeling_vaultgemma.py +41 -39
- transformers/models/vaultgemma/modular_vaultgemma.py +31 -29
- transformers/models/video_llama_3/configuration_video_llama_3.py +0 -4
- transformers/models/video_llama_3/image_processing_video_llama_3.py +42 -43
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +14 -12
- transformers/models/video_llama_3/modeling_video_llama_3.py +109 -157
- transformers/models/video_llama_3/modular_video_llama_3.py +146 -155
- transformers/models/video_llama_3/processing_video_llama_3.py +39 -5
- transformers/models/video_llama_3/video_processing_video_llama_3.py +23 -42
- transformers/models/video_llava/configuration_video_llava.py +1 -4
- transformers/models/video_llava/image_processing_video_llava.py +38 -35
- transformers/models/video_llava/modeling_video_llava.py +146 -146
- transformers/models/video_llava/processing_video_llava.py +78 -38
- transformers/models/video_llava/video_processing_video_llava.py +1 -0
- transformers/models/videomae/configuration_videomae.py +1 -0
- transformers/models/videomae/image_processing_videomae.py +34 -31
- transformers/models/videomae/modeling_videomae.py +17 -14
- transformers/models/videomae/video_processing_videomae.py +1 -0
- transformers/models/vilt/configuration_vilt.py +4 -6
- transformers/models/vilt/image_processing_vilt.py +30 -29
- transformers/models/vilt/image_processing_vilt_fast.py +16 -15
- transformers/models/vilt/modeling_vilt.py +90 -116
- transformers/models/vilt/processing_vilt.py +14 -2
- transformers/models/vipllava/configuration_vipllava.py +1 -4
- transformers/models/vipllava/modeling_vipllava.py +70 -99
- transformers/models/vipllava/modular_vipllava.py +54 -78
- transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +1 -0
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +27 -28
- transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +1 -0
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +41 -46
- transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +16 -2
- transformers/models/visual_bert/configuration_visual_bert.py +2 -6
- transformers/models/visual_bert/modeling_visual_bert.py +92 -98
- transformers/models/vit/configuration_vit.py +1 -0
- transformers/models/vit/image_processing_vit.py +22 -19
- transformers/models/vit/image_processing_vit_fast.py +1 -0
- transformers/models/vit/modeling_vit.py +17 -17
- transformers/models/vit_mae/configuration_vit_mae.py +1 -0
- transformers/models/vit_mae/modeling_vit_mae.py +27 -29
- transformers/models/vit_msn/configuration_vit_msn.py +1 -0
- transformers/models/vit_msn/modeling_vit_msn.py +16 -18
- transformers/models/vitdet/configuration_vitdet.py +1 -0
- transformers/models/vitdet/modeling_vitdet.py +14 -14
- transformers/models/vitmatte/configuration_vitmatte.py +5 -2
- transformers/models/vitmatte/image_processing_vitmatte.py +18 -15
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +18 -16
- transformers/models/vitmatte/modeling_vitmatte.py +11 -14
- transformers/models/vitpose/configuration_vitpose.py +7 -4
- transformers/models/vitpose/image_processing_vitpose.py +25 -24
- transformers/models/vitpose/image_processing_vitpose_fast.py +11 -9
- transformers/models/vitpose/modeling_vitpose.py +14 -14
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +1 -0
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +10 -8
- transformers/models/vits/configuration_vits.py +1 -4
- transformers/models/vits/modeling_vits.py +42 -44
- transformers/models/vits/tokenization_vits.py +4 -3
- transformers/models/vivit/configuration_vivit.py +1 -0
- transformers/models/vivit/image_processing_vivit.py +39 -36
- transformers/models/vivit/modeling_vivit.py +8 -6
- transformers/models/vjepa2/__init__.py +1 -0
- transformers/models/vjepa2/configuration_vjepa2.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +32 -31
- transformers/models/vjepa2/video_processing_vjepa2.py +1 -0
- transformers/models/voxtral/__init__.py +1 -0
- transformers/models/voxtral/configuration_voxtral.py +2 -0
- transformers/models/voxtral/modeling_voxtral.py +47 -40
- transformers/models/voxtral/modular_voxtral.py +40 -37
- transformers/models/voxtral/processing_voxtral.py +48 -25
- transformers/models/wav2vec2/configuration_wav2vec2.py +2 -4
- transformers/models/wav2vec2/feature_extraction_wav2vec2.py +10 -7
- transformers/models/wav2vec2/modeling_wav2vec2.py +121 -73
- transformers/models/wav2vec2/processing_wav2vec2.py +35 -6
- transformers/models/wav2vec2/tokenization_wav2vec2.py +332 -20
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +2 -4
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +62 -70
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +48 -57
- transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +35 -6
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +2 -4
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +77 -90
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +30 -37
- transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +17 -16
- transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +55 -36
- transformers/models/wavlm/configuration_wavlm.py +2 -4
- transformers/models/wavlm/modeling_wavlm.py +48 -50
- transformers/models/wavlm/modular_wavlm.py +5 -4
- transformers/models/whisper/configuration_whisper.py +5 -6
- transformers/models/whisper/english_normalizer.py +4 -3
- transformers/models/whisper/feature_extraction_whisper.py +24 -9
- transformers/models/whisper/generation_whisper.py +48 -26
- transformers/models/whisper/modeling_whisper.py +73 -79
- transformers/models/whisper/processing_whisper.py +20 -3
- transformers/models/whisper/tokenization_whisper.py +43 -11
- transformers/models/x_clip/configuration_x_clip.py +2 -4
- transformers/models/x_clip/modeling_x_clip.py +93 -96
- transformers/models/x_clip/processing_x_clip.py +14 -2
- transformers/models/xcodec/configuration_xcodec.py +6 -4
- transformers/models/xcodec/modeling_xcodec.py +17 -20
- transformers/models/xglm/configuration_xglm.py +8 -9
- transformers/models/xglm/modeling_xglm.py +55 -60
- transformers/models/xglm/tokenization_xglm.py +11 -3
- transformers/models/xlm/configuration_xlm.py +8 -10
- transformers/models/xlm/modeling_xlm.py +144 -144
- transformers/models/xlm/tokenization_xlm.py +5 -3
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +3 -11
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +194 -195
- transformers/models/xlm_roberta/modular_xlm_roberta.py +53 -50
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +18 -8
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +2 -10
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +93 -94
- transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +70 -67
- transformers/models/xlnet/configuration_xlnet.py +12 -3
- transformers/models/xlnet/modeling_xlnet.py +163 -152
- transformers/models/xlnet/tokenization_xlnet.py +9 -2
- transformers/models/xlstm/configuration_xlstm.py +12 -8
- transformers/models/xlstm/modeling_xlstm.py +65 -62
- transformers/models/xmod/configuration_xmod.py +3 -11
- transformers/models/xmod/modeling_xmod.py +110 -108
- transformers/models/yolos/configuration_yolos.py +1 -0
- transformers/models/yolos/image_processing_yolos.py +62 -60
- transformers/models/yolos/image_processing_yolos_fast.py +45 -42
- transformers/models/yolos/modeling_yolos.py +16 -16
- transformers/models/yolos/modular_yolos.py +19 -17
- transformers/models/yoso/configuration_yoso.py +2 -8
- transformers/models/yoso/modeling_yoso.py +63 -70
- transformers/models/zamba/configuration_zamba.py +8 -5
- transformers/models/zamba/modeling_zamba.py +78 -81
- transformers/models/zamba2/configuration_zamba2.py +50 -44
- transformers/models/zamba2/modeling_zamba2.py +97 -97
- transformers/models/zamba2/modular_zamba2.py +48 -46
- transformers/models/zoedepth/configuration_zoedepth.py +2 -1
- transformers/models/zoedepth/image_processing_zoedepth.py +29 -28
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +24 -21
- transformers/models/zoedepth/modeling_zoedepth.py +18 -26
- transformers/pipelines/__init__.py +114 -57
- transformers/pipelines/any_to_any.py +22 -14
- transformers/pipelines/audio_utils.py +2 -1
- transformers/pipelines/automatic_speech_recognition.py +12 -20
- transformers/pipelines/base.py +27 -15
- transformers/{models/pe_audio/processing_pe_audio.py → pipelines/deprecated/__init__.py} +3 -10
- transformers/pipelines/deprecated/text2text_generation.py +408 -0
- transformers/pipelines/document_question_answering.py +2 -4
- transformers/pipelines/image_text_to_text.py +1 -0
- transformers/pipelines/image_to_text.py +229 -0
- transformers/pipelines/question_answering.py +44 -5
- transformers/pipelines/text_classification.py +14 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/pipelines/token_classification.py +22 -1
- transformers/pipelines/video_classification.py +9 -1
- transformers/pipelines/zero_shot_audio_classification.py +1 -0
- transformers/pipelines/zero_shot_classification.py +6 -0
- transformers/pipelines/zero_shot_image_classification.py +7 -0
- transformers/processing_utils.py +145 -230
- transformers/quantizers/auto.py +4 -2
- transformers/quantizers/base.py +173 -53
- transformers/quantizers/quantizer_aqlm.py +23 -2
- transformers/quantizers/quantizer_auto_round.py +12 -2
- transformers/quantizers/quantizer_awq.py +89 -20
- transformers/quantizers/quantizer_bitnet.py +14 -4
- transformers/quantizers/quantizer_bnb_4bit.py +155 -18
- transformers/quantizers/quantizer_bnb_8bit.py +110 -24
- transformers/quantizers/quantizer_compressed_tensors.py +9 -2
- transformers/quantizers/quantizer_eetq.py +74 -16
- transformers/quantizers/quantizer_fbgemm_fp8.py +138 -38
- transformers/quantizers/quantizer_finegrained_fp8.py +113 -26
- transformers/quantizers/quantizer_fp_quant.py +82 -52
- transformers/quantizers/quantizer_gptq.py +28 -8
- transformers/quantizers/quantizer_higgs.py +60 -42
- transformers/quantizers/quantizer_hqq.py +153 -144
- transformers/quantizers/quantizer_mxfp4.py +194 -14
- transformers/quantizers/quantizer_quanto.py +79 -35
- transformers/quantizers/quantizer_quark.py +18 -36
- transformers/quantizers/quantizer_spqr.py +12 -4
- transformers/quantizers/quantizer_torchao.py +325 -50
- transformers/quantizers/quantizer_vptq.py +27 -4
- transformers/quantizers/quantizers_utils.py +0 -20
- transformers/safetensors_conversion.py +3 -9
- transformers/testing_utils.py +82 -326
- transformers/tokenization_mistral_common.py +903 -568
- transformers/tokenization_utils_base.py +340 -220
- transformers/tokenization_utils_sentencepiece.py +6 -5
- transformers/tokenization_utils_tokenizers.py +113 -226
- transformers/trainer.py +53 -60
- transformers/trainer_callback.py +0 -8
- transformers/trainer_seq2seq.py +1 -5
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +41 -77
- transformers/utils/__init__.py +4 -8
- transformers/utils/attention_visualizer.py +5 -5
- transformers/utils/auto_docstring.py +37 -599
- transformers/utils/doc.py +36 -4
- transformers/utils/dummy_pt_objects.py +42 -0
- transformers/utils/generic.py +28 -111
- transformers/utils/hub.py +15 -5
- transformers/utils/import_utils.py +32 -165
- transformers/utils/kernel_config.py +19 -74
- transformers/utils/loading_report.py +15 -25
- transformers/utils/quantization_config.py +241 -72
- transformers/video_processing_utils.py +39 -41
- transformers/video_utils.py +22 -18
- {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/METADATA +236 -284
- transformers-5.0.0rc0.dist-info/RECORD +1987 -0
- {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/WHEEL +1 -1
- transformers/integrations/moe.py +0 -360
- transformers/integrations/quark.py +0 -53
- transformers/loss/loss_lw_detr.py +0 -356
- transformers/models/ernie4_5_vl_moe/__init__.py +0 -31
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +0 -340
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +0 -455
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +0 -231
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +0 -1936
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +0 -1925
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +0 -249
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +0 -593
- transformers/models/fast_vlm/__init__.py +0 -27
- transformers/models/fast_vlm/configuration_fast_vlm.py +0 -137
- transformers/models/fast_vlm/modeling_fast_vlm.py +0 -432
- transformers/models/fast_vlm/modular_fast_vlm.py +0 -373
- transformers/models/glm4_moe_lite/__init__.py +0 -28
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +0 -233
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +0 -740
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +0 -302
- transformers/models/glm_image/__init__.py +0 -31
- transformers/models/glm_image/configuration_glm_image.py +0 -351
- transformers/models/glm_image/image_processing_glm_image.py +0 -503
- transformers/models/glm_image/image_processing_glm_image_fast.py +0 -294
- transformers/models/glm_image/modeling_glm_image.py +0 -1642
- transformers/models/glm_image/modular_glm_image.py +0 -1531
- transformers/models/glm_image/processing_glm_image.py +0 -217
- transformers/models/glmasr/__init__.py +0 -29
- transformers/models/glmasr/configuration_glmasr.py +0 -196
- transformers/models/glmasr/modeling_glmasr.py +0 -517
- transformers/models/glmasr/modular_glmasr.py +0 -443
- transformers/models/glmasr/processing_glmasr.py +0 -331
- transformers/models/jais2/__init__.py +0 -27
- transformers/models/jais2/configuration_jais2.py +0 -148
- transformers/models/jais2/modeling_jais2.py +0 -484
- transformers/models/jais2/modular_jais2.py +0 -194
- transformers/models/lasr/__init__.py +0 -29
- transformers/models/lasr/configuration_lasr.py +0 -244
- transformers/models/lasr/feature_extraction_lasr.py +0 -275
- transformers/models/lasr/modeling_lasr.py +0 -727
- transformers/models/lasr/modular_lasr.py +0 -574
- transformers/models/lasr/processing_lasr.py +0 -100
- transformers/models/lasr/tokenization_lasr.py +0 -184
- transformers/models/lighton_ocr/__init__.py +0 -28
- transformers/models/lighton_ocr/configuration_lighton_ocr.py +0 -128
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +0 -463
- transformers/models/lighton_ocr/modular_lighton_ocr.py +0 -404
- transformers/models/lighton_ocr/processing_lighton_ocr.py +0 -229
- transformers/models/lw_detr/__init__.py +0 -27
- transformers/models/lw_detr/configuration_lw_detr.py +0 -374
- transformers/models/lw_detr/modeling_lw_detr.py +0 -1702
- transformers/models/lw_detr/modular_lw_detr.py +0 -1615
- transformers/models/minimax_m2/__init__.py +0 -28
- transformers/models/minimax_m2/configuration_minimax_m2.py +0 -188
- transformers/models/minimax_m2/modeling_minimax_m2.py +0 -704
- transformers/models/minimax_m2/modular_minimax_m2.py +0 -346
- transformers/models/paddleocr_vl/__init__.py +0 -31
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +0 -335
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +0 -503
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +0 -209
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +0 -1683
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +0 -1380
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +0 -133
- transformers/models/pe_audio/__init__.py +0 -29
- transformers/models/pe_audio/configuration_pe_audio.py +0 -204
- transformers/models/pe_audio/feature_extraction_pe_audio.py +0 -160
- transformers/models/pe_audio/modeling_pe_audio.py +0 -819
- transformers/models/pe_audio/modular_pe_audio.py +0 -298
- transformers/models/pe_audio_video/__init__.py +0 -28
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +0 -223
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +0 -971
- transformers/models/pe_audio_video/modular_pe_audio_video.py +0 -763
- transformers/models/pe_video/__init__.py +0 -29
- transformers/models/pe_video/configuration_pe_video.py +0 -209
- transformers/models/pe_video/modeling_pe_video.py +0 -647
- transformers/models/pe_video/modular_pe_video.py +0 -231
- transformers/models/pe_video/processing_pe_video.py +0 -10
- transformers/models/pe_video/video_processing_pe_video.py +0 -64
- transformers/models/pixio/__init__.py +0 -29
- transformers/models/pixio/configuration_pixio.py +0 -150
- transformers/models/pixio/modeling_pixio.py +0 -507
- transformers/models/pixio/modular_pixio.py +0 -403
- transformers/models/solar_open/__init__.py +0 -27
- transformers/models/solar_open/configuration_solar_open.py +0 -184
- transformers/models/solar_open/modeling_solar_open.py +0 -642
- transformers/models/solar_open/modular_solar_open.py +0 -224
- transformers/trainer_jit_checkpoint.py +0 -125
- transformers-5.0.0.dist-info/RECORD +0 -2068
- {transformers-5.0.0.dist-info/licenses → transformers-5.0.0rc0.dist-info}/LICENSE +0 -0
- {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/top_level.txt +0 -0
@@ -14,39 +14,39 @@
 import os
 import re
 import shutil
-
+import warnings
+from collections.abc import Callable, Mapping, Sized
 from enum import Enum
 from pathlib import Path
-from typing import Any,
+from typing import Any, Union, overload

 import numpy as np
 from huggingface_hub import create_repo

 from transformers.audio_utils import load_audio_as
 from transformers.tokenization_utils_base import (
+    LARGE_INTEGER,
     VERY_LARGE_INTEGER,
-    AddedToken,
     BatchEncoding,
     EncodedInput,
     PreTokenizedInput,
-    PreTrainedTokenizerBase,
     TextInput,
     TruncationStrategy,
 )
 from transformers.utils import PaddingStrategy, TensorType, add_end_docstrings, logging, to_py_obj
+from transformers.utils.generic import is_torch_tensor
+from transformers.utils.hub import PushToHubMixin
 from transformers.utils.import_utils import is_mistral_common_available, is_torch_available, requires


 if is_mistral_common_available():
     from mistral_common.protocol.instruct.request import ChatCompletionRequest
     from mistral_common.protocol.instruct.validator import ValidationMode
-    from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy,
+    from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy, TokenizerVersion
+    from mistral_common.tokens.tokenizers.image import MultiModalVersion
     from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
     from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-    from mistral_common.tokens.tokenizers.utils import
-        download_tokenizer_from_hf_hub,
-        get_one_valid_tokenizer_file,
-    )
+    from mistral_common.tokens.tokenizers.utils import download_tokenizer_from_hf_hub


 if is_torch_available():
@@ -103,10 +103,6 @@ ENCODE_KWARGS_DOCSTRING = r"""
 """

 ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
-    return_token_type_ids (`bool`, *optional*):
-        Whether to return token type IDs. For `MistralCommonBackend` it returns a list of zeros of the sequence length as only one sequence is supported.
-
-        [What are token type IDs?](../glossary#token-type-ids)
     return_attention_mask (`bool`, *optional*):
         Whether to return the attention mask. If left to the default, will return the attention mask according
         to the specific tokenizer's default, defined by the `return_outputs` attribute.
@@ -122,8 +118,6 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
         Whether or not to return the lengths of the encoded inputs.
     verbose (`bool`, *optional*, defaults to `True`):
         Whether or not to print more information and warnings.
-    return_offsets_mapping (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
-    split_special_tokens (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
     **kwargs: passed to the `self.tokenize()` method

 Return:
@@ -155,35 +149,8 @@ class MistralTokenizerType(str, Enum):
     tekken = "tekken"


-@overload
-def _maybe_remove_lang(text: str, skip_special_tokens: bool) -> str: ...
-@overload
-def _maybe_remove_lang(text: list[str], skip_special_tokens: bool) -> list[str]: ...
-def _maybe_remove_lang(text: str | list[str], skip_special_tokens: bool) -> str | list[str]:
-    # in the specific case of Voxtral, the added f"lang:xx" (always a two char language code since it follows ISO 639-1 alpha-2 format)
-    # is not considered as a special token by mistral-common and is encoded/ decoded as normal text.
-    # Nevertheless we should remove it to ease users life.
-    if not skip_special_tokens:
-        return text
-
-    if isinstance(text, str):
-        return re.sub(r"^lang:[a-z]{2}", "", text)
-
-    return [re.sub(r"^lang:[a-z]{2}", "", string) for string in text]
-
-
-_MAP_SPECIAL_TOKENS = {
-    "bos_token": SpecialTokens.bos.value,
-    "eos_token": SpecialTokens.eos.value,
-    "pad_token": SpecialTokens.pad.value,
-    "unk_token": SpecialTokens.unk.value,
-}
-
-_VALID_INIT_KWARGS = {"_from_auto", "backend", "files_loaded"}
-
-
 @requires(backends=("mistral-common",))
-class MistralCommonBackend(PreTrainedTokenizerBase):
+class MistralCommonBackend(PushToHubMixin):
     """
     Class to wrap `mistral-common` tokenizers.

@@ -198,13 +165,34 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
     For more info on `mistral-common`, see [mistral-common](https://github.com/mistralai/mistral-common).

     This class is a wrapper around a `mistral_common.tokens.tokenizers.mistral.MistralTokenizer`.
-    It provides a Hugging Face compatible interface to tokenize using the official mistral-common tokenizer
-
-
-
-    -
+    It provides a Hugging Face compatible interface to tokenize using the official mistral-common tokenizer.
+
+    Supports the following methods from the `PreTrainedTokenizerBase` class:
+
+    - [`~MistralCommonBackend.get_vocab`]: Returns the vocabulary as a dictionary of token to index.
+      This is a lossy conversion for Tekkenizer as some decoding errors are collapsed into the same token.
+    - [`~MistralCommonBackend.encode`]: Encode a string to a list of integers.
+    - [`~MistralCommonBackend.decode`]: Decode a list of integers to a string.
+    - [`~MistralCommonBackend.batch_decode`]: Decode a batch of list of integers to a list of strings.
+    - [`~MistralCommonBackend.convert_tokens_to_ids`]: Convert a list of tokens to a list of integers.
+    - [`~MistralCommonBackend.convert_ids_to_tokens`]: Convert a list of integers to a list of tokens.
+    - [`~MistralCommonBackend.tokenize`]: Tokenize a string.
+    - [`~MistralCommonBackend.get_special_tokens_mask`]: Get the special tokens mask for a list of tokens.
+    - [`~MistralCommonBackend.prepare_for_model`]: Prepare a list of inputs for the model.
+    - [`~MistralCommonBackend.pad`]: Pad a list of inputs to the same length.
+    - [`~MistralCommonBackend.truncate_sequences`]: Truncate a list of sequences to the same length.
+    - [`~MistralCommonBackend.apply_chat_template`]: Apply a chat template to a list of messages.
+    - [`~MistralCommonBackend.__call__`]: Tokenize a string or a list of strings.
+    - [`~MistralCommonBackend.from_pretrained`]: Download and cache a pretrained tokenizer from the Hugging Face model hub or local directory.
+    - [`~MistralCommonBackend.save_pretrained`]: Save a tokenizer to a directory, so it can be reloaded using the `from_pretrained` class method.
+    - [`~MistralCommonBackend.push_to_hub`]: Upload tokenizer to the Hugging Face model hub.
+
+    Here are the key differences with the `PreTrainedTokenizerBase` class:
+
+    - Pair of sequences are not supported. The signature have been kept for compatibility but all arguments related to pair of sequences are ignored. The return values of pairs are returned as `None`.
     - The `is_split_into_words` argument is not supported.
-    -
+    - The `return_token_type_ids` argument is not supported.
+    - It is not possible to add new tokens to the tokenizer. Also the special tokens are handled differently from Transformers. In `mistral-common`, special tokens are never encoded directly. This means that: `tokenizer.encode("<s>")` will not return the ID of the `<s>` token. Instead, it will return a list of IDs corresponding to the tokenization of the string `"<s>"`. For more information, see the [mistral-common documentation](https://mistralai.github.io/mistral-common/usage/tokenizers/#special-tokens).

     If you have suggestions to improve this class, please open an issue on the [mistral-common GitHub repository](https://github.com/mistralai/mistral-common/issues) if it is related to the tokenizer or on the [Transformers GitHub repository](https://github.com/huggingface/transformers/issues) if it is related to the Hugging Face interface.
     """
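The docstring added above is the main behavioral contract of the rewritten backend, and the special-token point is easy to miss, so here is a minimal illustrative sketch. It is not part of the diff; the checkpoint name is a placeholder and the exact ids depend on the tokenizer that gets loaded.

    from transformers.tokenization_mistral_common import MistralCommonBackend

    # Hypothetical repo id: any checkpoint shipping a mistral-common tokenizer file behaves the same way.
    tok = MistralCommonBackend.from_pretrained("mistralai/<some-checkpoint>")

    # Special tokens are never encoded directly: "<s>" is tokenized as the literal string,
    # not mapped to the BOS id, unlike the usual PreTrainedTokenizerBase behavior.
    ids = tok.encode("<s>")
    print(ids, tok.bos_token_id)
    print(tok.convert_ids_to_tokens(ids))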
@@ -212,12 +200,6 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
     model_input_names: list[str] = ["input_ids", "attention_mask"]
     padding_side: str = "left"
     truncation_side: str = "right"
-    SPECIAL_TOKENS_ATTRIBUTES = [
-        "bos_token",
-        "eos_token",
-        "unk_token",
-        "pad_token",
-    ]

     def __init__(
         self,
@@ -244,7 +226,7 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
                 Path to the tokenizer file to load the `MistralTokenizer`.
             mode (`Union[str, ValidationMode]`, *optional*, defaults to `ValidationMode.test`):
                 The mode to use for the tokenizer. This will be passed to the `MistralTokenizer` constructor. Possible values are:
-                - `"finetuning"` or `ValidationMode.finetuning`: The
+                - `"finetuning"` or `ValidationMode.finetuning`: The finetuning mode.
                 - `"test"` or `ValidationMode.test`: The test mode.
                 It changes how the tokenizer validates the input and prepares the request to the model.
             model_max_length (`int`, *optional*):
@@ -258,49 +240,60 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
             truncation_side (`str`, *optional*):
                 The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
                 Default value is picked from the class attribute of the same name.
-            model_input_names (`List[
+            model_input_names (`List[string]`, *optional*):
                 The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
                 `"attention_mask"`). Default value is picked from the class attribute of the same name.
             clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
-                Whether or not the model should
+                Whether or not the model should cleanup the spaces that were added when splitting the input text during the
                 tokenization process.
         """
-        if kwargs
+        if kwargs:
             raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported to init `MistralCommonBackend`.")
 
-        self.init_kwargs = {
-            "tokenizer_path": tokenizer_path,
-            "mode": mode,
-            "model_max_length": model_max_length,
-            "padding_side": padding_side,
-            "truncation_side": truncation_side,
-            "model_input_names": model_input_names,
-            "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
-        }
         self._tokenizer_path = Path(tokenizer_path)
         self._mode = self._get_validation_mode(mode)
-
         self.tokenizer: MistralTokenizer = MistralTokenizer.from_file(str(self._tokenizer_path), mode=self._mode)
         self._tokenizer_type = (
             MistralTokenizerType.tekken
             if isinstance(self.tokenizer.instruct_tokenizer.tokenizer, Tekkenizer)
             else MistralTokenizerType.spm
         )
-        self.
+        self.truncation_side = truncation_side
+        self.padding_side = padding_side
+        self.model_max_length = model_max_length
+        self.cleanup_tokenization_spaces = clean_up_tokenization_spaces
+        self.deprecation_warnings = {}  # Use to store when we have already noticed a deprecation warning (avoid overlogging).
+        self._all_special_tokens_ids = self._get_all_special_ids()
+
+        if model_input_names is not None:
+            if (
+                not isinstance(model_input_names, (list, tuple))
+                and len(model_input_names) == 0
+                and not all(isinstance(i, str) for i in model_input_names)
+            ):
+                raise ValueError(
+                    "`model_input_names` should be a non-empty list or tuple of str but got an empty value."
+                )
+            self.model_input_names = model_input_names
 
-        self.
-        self._all_special_tokens = self.convert_ids_to_tokens(self.all_special_ids)
+        self._cache_get_vocab: dict[str, int] | None = None
 
-
-
-
-
-
-
-
-
-
-
+    @staticmethod
+    def clean_up_tokenization(text: str) -> str:
+        """
+        Clean up a list of simple English tokenization artifacts like spaces before punctuation.
+        """
+        return (
+            text.replace(" .", ".")
+            .replace(" ?", "?")
+            .replace(" !", "!")
+            .replace(" ,", ",")
+            .replace(" ' ", "'")
+            .replace(" n't", "n't")
+            .replace(" 'm", "'m")
+            .replace(" 's", "'s")
+            .replace(" 've", "'ve")
+            .replace(" 're", "'re")
         )
 
     @property
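
A minimal standalone sketch of the cleanup chain added above (it simply mirrors the new `clean_up_tokenization` staticmethod; the free function name here is illustrative, not part of the package):

```python
def clean_up_tokenization(text: str) -> str:
    # Same replace chain as the new staticmethod in the diff above.
    return (
        text.replace(" .", ".")
        .replace(" ?", "?")
        .replace(" !", "!")
        .replace(" ,", ",")
        .replace(" ' ", "'")
        .replace(" n't", "n't")
        .replace(" 'm", "'m")
        .replace(" 's", "'s")
        .replace(" 've", "'ve")
        .replace(" 're", "'re")
    )

print(clean_up_tokenization("Hello , world ! I 'm here ."))  # -> "Hello, world! I'm here."
```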
@@ -313,19 +306,75 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
         """
         return self._mode
 
+    @property
+    def bos_token_id(self) -> int:
+        """
+        Id of the beginning of sentence token in the vocabulary.
+        """
+        return self.tokenizer.instruct_tokenizer.tokenizer.bos_id
+
+    @property
+    def eos_token_id(self) -> int:
+        """
+        Id of the end of sentence token in the vocabulary.
+        """
+        return self.tokenizer.instruct_tokenizer.tokenizer.eos_id
+
+    @property
+    def unk_token_id(self) -> int:
+        """
+        Id of the unknown token in the vocabulary.
+        """
+        return self.tokenizer.instruct_tokenizer.tokenizer.unk_id
+
+    @property
+    def pad_token_id(self) -> int:
+        """
+        Id of the padding token in the vocabulary.
+        """
+        return self.tokenizer.instruct_tokenizer.tokenizer.pad_id
+
+    @property
+    def bos_token(self) -> str:
+        """
+        String associated to the beginning of sentence token in the vocabulary.
+        """
+        return self.convert_ids_to_tokens(self.bos_token_id)
+
+    @property
+    def eos_token(self) -> str:
+        """
+        String associated to the end of sentence token in the vocabulary.
+        """
+        return self.convert_ids_to_tokens(self.eos_token_id)
+
+    @property
+    def unk_token(self) -> str:
+        """
+        String associated to the unknown token in the vocabulary.
+        """
+        return self.convert_ids_to_tokens(self.unk_token_id)
+
+    @property
+    def pad_token(self) -> str:
+        """
+        String associated to the padding token in the vocabulary.
+        """
+        return self.convert_ids_to_tokens(self.pad_token_id)
+
     @property
     def all_special_ids(self) -> list[int]:
         """
         `list[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.).
         """
-        return sorted(self.
+        return sorted(self._all_special_tokens_ids)
 
     @property
     def all_special_tokens(self) -> list[str]:
         """
         `list[str]`: A list of all unique special tokens.
         """
-        return self.
+        return self.convert_ids_to_tokens(self.all_special_ids)
 
     @property
     def vocab_size(self) -> int:
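
A small runnable mock of the property-delegation pattern introduced above: the id properties read from the wrapped mistral-common tokenizer, and the string properties go through `convert_ids_to_tokens`. The toy classes and id-to-piece table below are illustrative only.

```python
class _InnerTok:
    bos_id = 1

    def id_to_piece(self, i: int) -> str:
        return {1: "<s>"}[i]


class _Backend:
    def __init__(self):
        self._inner = _InnerTok()

    @property
    def bos_token_id(self) -> int:
        return self._inner.bos_id

    @property
    def bos_token(self) -> str:
        # Same indirection as the real property: id property -> piece lookup.
        return self.convert_ids_to_tokens(self.bos_token_id)

    def convert_ids_to_tokens(self, i: int) -> str:
        return self._inner.id_to_piece(i)


assert _Backend().bos_token == "<s>"
```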
@@ -386,8 +435,6 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
         padding_side: str | None = None,
         return_tensors: str | TensorType | None = None,
         verbose: bool = True,
-        return_offsets_mapping: Literal[False] = False,
-        split_special_tokens: Literal[False] = False,
         **kwargs,
     ) -> list[int]:
         """
@@ -399,81 +446,37 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
             text_pair (`None`, *optional*):
                 Not supported by `MistralCommonBackend.encode`. Kept to match `PreTrainedTokenizerBase.encode` signature.
         """
-        if return_offsets_mapping or split_special_tokens:
-            raise ValueError(
-                "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
-            )
-
-        if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND, "only_first", "only_second"]:
-            raise ValueError(
-                "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonBackend`."
-            )
-
         if kwargs:
             raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.encode`.")
-
         if text_pair:
             raise ValueError("`MistralCommonBackend.encode` does not support `text_pair`.")
 
-
-            text=text,
-            text_pair=text_pair,
-            add_special_tokens=add_special_tokens,
+        padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
             padding=padding,
             truncation=truncation,
             max_length=max_length,
-            stride=stride,
-            return_tensors=return_tensors,
             pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
             verbose=verbose,
         )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        special_token_policy = SpecialTokenPolicy.IGNORE if skip_special_tokens else SpecialTokenPolicy.KEEP
-
-        text = self.tokenizer.decode(token_ids, special_token_policy=special_token_policy)
-
-        # Apply tokenizer-specific cleanup if available and requested
-        clean_up_tokenization_spaces = (
-            clean_up_tokenization_spaces
-            if clean_up_tokenization_spaces is not None
-            else self.clean_up_tokenization_spaces
+        encoded_inputs = self._encode_plus(
+            text,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_attention_mask=False,
+            return_overflowing_tokens=False,
+            return_special_tokens_mask=False,
+            return_length=False,
+            verbose=verbose,
         )
-        if clean_up_tokenization_spaces:
-            # Call custom cleanup method if it exists (e.g., for CLVP's [SPACE] token replacement)
-            if hasattr(self, "clean_up_tokenization") and callable(self.clean_up_tokenization):
-                text = self.clean_up_tokenization(text)
-            else:
-                # Otherwise apply standard cleanup
-                text = (
-                    text.replace(" .", ".")
-                    .replace(" ?", "?")
-                    .replace(" !", "!")
-                    .replace(" ,", ",")
-                    .replace(" ' ", "'")
-                    .replace(" n't", "n't")
-                    .replace(" 'm", "'m")
-                    .replace(" 's", "'s")
-                    .replace(" 've", "'ve")
-                    .replace(" 're", "'re")
-                )
 
-        return
+        return encoded_inputs["input_ids"]
 
     def decode(
         self,
@@ -481,7 +484,7 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
         skip_special_tokens: bool = False,
         clean_up_tokenization_spaces: bool | None = None,
         **kwargs,
-    ) -> str
+    ) -> Union[str, list[str]]:
         """
         Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
         tokens and clean up tokenization spaces.
@@ -506,7 +509,16 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
         if kwargs:
             raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.decode`.")
 
-
+        token_ids = to_py_obj(token_ids)
+
+        if isinstance(token_ids, (list, tuple)) and len(token_ids) > 0 and isinstance(token_ids[0], (list, tuple)):
+            return self._batch_decode(
+                sequences=token_ids,
+                skip_special_tokens=skip_special_tokens,
+                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            )
+
+        return self._decode(
             token_ids=token_ids,
             skip_special_tokens=skip_special_tokens,
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
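
A standalone check mirroring the batch-dispatch test added to `decode` above: a nested list/tuple of ids is routed to `_batch_decode`, a flat one to `_decode`. The helper name is illustrative.

```python
def is_batched(token_ids) -> bool:
    # Same condition as in the diff: non-empty list/tuple whose first item is itself a list/tuple.
    return (
        isinstance(token_ids, (list, tuple))
        and len(token_ids) > 0
        and isinstance(token_ids[0], (list, tuple))
    )

assert is_batched([[1, 2], [3]]) is True   # would go to _batch_decode
assert is_batched([1, 2, 3]) is False      # would go to _decode
```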
@@ -543,12 +555,63 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
         if kwargs:
             raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.batch_decode`.")
 
-        return
+        return self._batch_decode(
             sequences=sequences,
             skip_special_tokens=skip_special_tokens,
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
         )
 
+    def _decode(
+        self,
+        token_ids: Union[int, list[int], list[list[int]], np.ndarray, "torch.Tensor"],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: bool | None = None,
+    ) -> str:
+        clean_up_tokenization_spaces = clean_up_tokenization_spaces or self.cleanup_tokenization_spaces
+
+        # Convert inputs to python lists
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+
+        token_ids = to_py_obj(token_ids)
+
+        special_token_policy = SpecialTokenPolicy.IGNORE if skip_special_tokens else SpecialTokenPolicy.KEEP
+
+        decoded_string = self.tokenizer.decode(token_ids, special_token_policy=special_token_policy)
+        if clean_up_tokenization_spaces:
+            decoded_string = self.clean_up_tokenization(decoded_string)
+
+        # in the specific case of Voxtral, the added f"lang:xx" (always a two char language code since it follows ISO 639-1 alpha-2 format)
+        # is not considered as a special token by mistral-common and is encoded/ decoded as normal text.
+        # Nevertheless we should remove it to ease users life.
+        if skip_special_tokens:
+            decoded_string = re.sub(r"^lang:[a-z]{2}", "", decoded_string)
+
+        return decoded_string
+
+    def _batch_decode(
+        self,
+        sequences: Union[list[int], list[list[int]], np.ndarray, "torch.Tensor"],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: bool | None = None,
+    ) -> list[str]:
+        return [
+            self._decode(
+                seq,
+                skip_special_tokens=skip_special_tokens,
+                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            )
+            for seq in sequences
+        ]
+
+    def _is_control_token(self, token_id: int) -> bool:
+        if self._tokenizer_type == MistralTokenizerType.spm:
+            return token_id in self.tokenizer.instruct_tokenizer.tokenizer._control_tokens
+        elif self._tokenizer_type == MistralTokenizerType.tekken:
+            return token_id < self.tokenizer.instruct_tokenizer.tokenizer.num_special_tokens
+        else:
+            raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
+
     @overload
     def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ...
     @overload
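
The Voxtral-specific cleanup added in `_decode` above strips a leading two-letter `lang:xx` prefix when special tokens are skipped. A standalone illustration (the sample string is made up):

```python
import re

decoded = "lang:enHello world"
print(re.sub(r"^lang:[a-z]{2}", "", decoded))  # -> "Hello world"
```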
@@ -569,22 +632,22 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
         """
 
         if isinstance(ids, int):
-
+            one_token = True
             ids = [ids]
         else:
-
+            one_token = False
 
         tokens: list[str] = []
         for token_id in ids:
-            if self.
+            if self._is_control_token(token_id) and skip_special_tokens:
                 continue
             tokens.append(self.tokenizer.instruct_tokenizer.tokenizer.id_to_piece(token_id))
 
-        if
-
-
-            return tokens[0]
+        if one_token:
+            if tokens == []:
+                raise ValueError(f"Invalid token id {ids}.")
 
+            return tokens[0]
         return tokens
 
     def _tekken_piece_to_id(self, piece: str, warn: bool) -> int:
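
A standalone sketch of the new single-id handling in `convert_ids_to_tokens`: an int input is wrapped in a list and the lone decoded piece is returned. The toy id-to-piece table and helper name are illustrative.

```python
pieces = {3: "▁Hello"}

def convert_ids_to_tokens(ids, skip_special_tokens=False):
    one_token = isinstance(ids, int)
    if one_token:
        ids = [ids]
    tokens = [pieces[i] for i in ids]
    if one_token:
        if tokens == []:
            raise ValueError(f"Invalid token id {ids}.")
        return tokens[0]
    return tokens

assert convert_ids_to_tokens(3) == "▁Hello"
assert convert_ids_to_tokens([3]) == ["▁Hello"]
```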
@@ -645,13 +708,7 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
         tokens_ids = self.tokenizer.instruct_tokenizer.tokenizer.encode(text, bos=add_special_tokens, eos=add_eos)
         return tokens_ids
 
-    def tokenize(
-        self,
-        text: TextInput,
-        return_offsets_mapping: Literal[False] = False,
-        split_special_tokens: Literal[False] = False,
-        **kwargs,
-    ) -> list[str]:
+    def tokenize(self, text: TextInput, **kwargs) -> list[str]:
         """
         Converts a string into a sequence of tokens, using the tokenizer.
 
@@ -660,8 +717,6 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
         Args:
             text (`str`):
                 The sequence to be encoded.
-            return_offsets_mapping (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
-            split_special_tokens (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
             **kwargs (additional keyword arguments):
                 Not supported by `MistralCommonBackend.tokenize`.
                 Will raise an error if used.
@@ -669,164 +724,40 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
         Returns:
             `list[str]`: The list of tokens.
         """
-        if return_offsets_mapping or split_special_tokens:
-            raise ValueError(
-                "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
-            )
-
         if kwargs:
             raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.tokenize`.")
 
         return self.convert_ids_to_tokens(self._text_to_ids(text, add_special_tokens=False), skip_special_tokens=False)
 
-    def
-        if self._tokenizer_type == MistralTokenizerType.tekken:
-            return self.tokenizer.instruct_tokenizer.tokenizer._special_token_ids
-        elif self._tokenizer_type == MistralTokenizerType.spm:
-            return {
-                token_id
-                for token_id in range(self.tokenizer.instruct_tokenizer.tokenizer.n_words)
-                if self.tokenizer.instruct_tokenizer.tokenizer.is_special(token_id)
-            }
-        else:
-            raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: None = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
-
-        Args:
-            token_ids_0 (`list[int]`): List of ids of the sequence.
-            token_ids_1 (`None`, *optional*): None, kept to match Transformers' implementation.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if token_ids_1 is not None:
-            raise ValueError(
-                "`token_ids_1` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
-            )
-
-        if already_has_special_tokens:
-            return [1 if int(token_id) in self._all_special_ids else 0 for token_id in token_ids_0]
-
-        if self.mode == ValidationMode.test:
-            # [BOS] seq0
-            return [1] + ([0] * len(token_ids_0))
-        else:
-            # [BOS] seq0 [EOS]
-            return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def _encode_plus(  # type: ignore[override]
+    def _encode_plus(
         self,
-        text: TextInput |
-        text_pair: None = None,
+        text: TextInput | EncodedInput,
         add_special_tokens: bool = True,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
         max_length: int | None = None,
         stride: int = 0,
-        is_split_into_words: bool = False,
         pad_to_multiple_of: int | None = None,
         padding_side: str | None = None,
         return_tensors: str | TensorType | None = None,
-        return_token_type_ids: bool | None = None,
         return_attention_mask: bool | None = None,
         return_overflowing_tokens: bool = False,
         return_special_tokens_mask: bool = False,
         return_length: bool = False,
         verbose: bool = True,
-        return_offsets_mapping: Literal[False] = False,
-        split_special_tokens: Literal[False] = False,
-        **kwargs,
     ) -> BatchEncoding:
-        # Detect batched inputs (list of sequences)
-        if text_pair is not None:
-            raise ValueError("`MistralCommonBackend` does not support `text_pair != None` for `_encode_plus`.")
-
-        if return_offsets_mapping or split_special_tokens:
-            raise ValueError(
-                "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
-            )
-
-        if kwargs:
-            raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend._encode_plus`.")
-
-        is_batched = isinstance(text, (list, tuple)) and (
-            (not text and not is_split_into_words)
-            or (text and is_split_into_words and isinstance(text[0], (list, tuple)))
-            or (text and not is_split_into_words and isinstance(text[0], (str, list, tuple)))
-        )
-
-        if is_batched:
-            batch_outputs = {}
-            one_overflowed = False
-            for current_text in text:
-                current_output = self._encode_plus(
-                    text=current_text,
-                    text_pair=None,
-                    add_special_tokens=add_special_tokens,
-                    padding_strategy=PaddingStrategy.DO_NOT_PAD,  # we pad in batch afterward
-                    truncation_strategy=truncation_strategy,
-                    max_length=max_length,
-                    stride=stride,
-                    is_split_into_words=is_split_into_words,
-                    pad_to_multiple_of=None,  # we pad in batch afterward
-                    padding_side=None,  # we pad in batch afterward
-                    return_tensors=None,  # We convert the whole batch to tensors at the end
-                    return_token_type_ids=return_token_type_ids,
-                    return_attention_mask=False,  # we pad in batch afterward
-                    return_overflowing_tokens=return_overflowing_tokens,
-                    return_special_tokens_mask=return_special_tokens_mask,
-                    return_length=return_length,
-                    verbose=verbose,
-                )
-                for key, value in current_output.items():
-                    batch_outputs.setdefault(key, []).append(value)
-
-                # To ensure the list is built for each sample, we need to add this.
-                if return_overflowing_tokens and not return_tensors:
-                    if "overflowing_tokens" not in current_output:
-                        batch_outputs.setdefault("overflowing_tokens", []).append([0])
-                        batch_outputs.setdefault("num_truncated_tokens", []).append([0])
-                    else:
-                        one_overflowed = True
-
-            # Remove overflow-related keys before tensor conversion if return_tensors is set
-            # Slow tokenizers don't support returning these as tensors
-            if return_overflowing_tokens and (return_tensors or not one_overflowed):
-                batch_outputs.pop("overflowing_tokens", None)
-                batch_outputs.pop("num_truncated_tokens", None)
-
-            batch_outputs = self.pad(
-                batch_outputs,
-                padding=padding_strategy.value,
-                max_length=max_length,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_attention_mask=return_attention_mask,
-            )
-
-            return BatchEncoding(batch_outputs, tensor_type=return_tensors)
-
         def get_input_ids(text):
             if isinstance(text, str):
-                return self._text_to_ids(text,
+                return self._text_to_ids(text, add_special_tokens)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                 return text
             else:
                 raise ValueError(f"Input {text} is not valid. Should be a string, or a list/tuple of integers.")
 
-
+        ids = get_input_ids(text)
 
         return self.prepare_for_model(
-
-            pair_ids=None,
+            ids,
             add_special_tokens=add_special_tokens,
             padding=padding_strategy.value,
             truncation=truncation_strategy.value,
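
A runnable toy version of the simplified `tokenize` path shown above: the text is encoded to ids without special tokens, then each id is mapped back to its piece. The toy vocabulary and class below are illustrative only.

```python
_VOCAB = {"Hello": 10, "world": 11}
_PIECES = {v: k for k, v in _VOCAB.items()}


class _ToyBackend:
    def _text_to_ids(self, text, add_special_tokens=False):
        return [_VOCAB[w] for w in text.split()]

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        return [_PIECES[i] for i in ids]

    def tokenize(self, text):
        # Mirrors the diff: ids without special tokens, then pieces.
        return self.convert_ids_to_tokens(self._text_to_ids(text, add_special_tokens=False), skip_special_tokens=False)


print(_ToyBackend().tokenize("Hello world"))  # -> ['Hello', 'world']
```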
@@ -837,128 +768,242 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
             return_tensors=return_tensors,
             prepend_batch_axis=True,
             return_attention_mask=return_attention_mask,
-            return_token_type_ids=return_token_type_ids,
             return_overflowing_tokens=return_overflowing_tokens,
             return_special_tokens_mask=return_special_tokens_mask,
             return_length=return_length,
             verbose=verbose,
         )
 
-
-    def prepare_for_model(
+    def _batch_encode_plus(
         self,
-
-        pair_ids: None = None,
+        batch_text: list[TextInput] | list[EncodedInput],
         add_special_tokens: bool = True,
-
-
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
         max_length: int | None = None,
         stride: int = 0,
         pad_to_multiple_of: int | None = None,
         padding_side: str | None = None,
         return_tensors: str | TensorType | None = None,
-        return_token_type_ids: bool | None = None,
         return_attention_mask: bool | None = None,
         return_overflowing_tokens: bool = False,
         return_special_tokens_mask: bool = False,
         return_length: bool = False,
         verbose: bool = True,
-        prepend_batch_axis: bool = False,
-        return_offsets_mapping: Literal[False] = False,
-        split_special_tokens: Literal[False] = False,
-        **kwargs,
     ) -> BatchEncoding:
+        def get_input_ids(text):
+            if isinstance(text, str):
+                return self._text_to_ids(text, add_special_tokens)
+            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
+                return text
+            else:
+                raise ValueError("Input is not valid. Should be a string or a list/tuple of integers.")
+
+        input_ids = []
+        for ids in batch_text:
+            input_ids.append(get_input_ids(ids))
+
+        batch_outputs = self._batch_prepare_for_model(
+            input_ids,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_length=return_length,
+            return_tensors=return_tensors,
+            verbose=verbose,
+        )
+
+        return BatchEncoding(batch_outputs)
+
+    def _get_all_special_ids(self) -> set[int]:
+        if self._tokenizer_type == MistralTokenizerType.tekken:
+            return {t["rank"] for t in self.tokenizer.instruct_tokenizer.tokenizer._all_special_tokens}
+        elif self._tokenizer_type == MistralTokenizerType.spm:
+            return self.tokenizer.instruct_tokenizer.tokenizer._control_tokens
+        else:
+            raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list, token_ids_1: None = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
         """
-
-
-        manages a moving window (with user defined stride) for overflowing tokens.
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
 
         Args:
-
-
-
+            token_ids_0 (`list[int]`):
+                List of ids of the sequence.
+            token_ids_1 (`list[int]`, *optional*):
                 Not supported by `MistralCommonBackend`. Kept to match the interface of `PreTrainedTokenizerBase`.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
-        if
+        if token_ids_1 is not None:
             raise ValueError(
-                "`
+                "`token_ids_1` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
             )
-
-        if pair_ids is not None:
+        if already_has_special_tokens:
             raise ValueError(
-                "`
+                "`already_has_special_tokens` is not supported by `MistralCommonBackend` and should be `False`."
             )
 
-        if
-
-                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.prepare_for_model`."
-            )
+        special_tokens_mask = [1 if token in self._all_special_tokens_ids else 0 for token in token_ids_0]
+        return special_tokens_mask
 
-
+    def _batch_prepare_for_model(
+        self,
+        batch_ids: list[PreTokenizedInput | list[int]],
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: int | None = None,
+        stride: int = 0,
+        pad_to_multiple_of: int | None = None,
+        padding_side: str | None = None,
+        return_tensors: str | None = None,
+        return_attention_mask: bool | None = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+    ) -> BatchEncoding:
+        """
+        Prepares a sequence of input id so that it can be used by the model. It
+        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
+        manages a moving window (with user defined stride) for overflowing tokens.
+
+        Args:
+            batch_ids: list of tokenized input ids
+        """
+
+        batch_outputs = {}
+        for ids in batch_ids:
+            outputs = self.prepare_for_model(
+                ids,
+                add_special_tokens=add_special_tokens,
+                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
+                truncation=truncation_strategy.value,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=None,  # we pad in batch afterward
+                padding_side=None,  # we pad in batch afterward
+                return_attention_mask=False,  # we pad in batch afterward
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_length=return_length,
+                return_tensors=None,  # We convert the whole batch to tensors at the end
+                prepend_batch_axis=False,
+                verbose=verbose,
+            )
+
+            for key, value in outputs.items():
+                if key not in batch_outputs:
+                    batch_outputs[key] = []
+                batch_outputs[key].append(value)
+
+        batch_outputs = self.pad(
+            batch_outputs,
+            padding=padding_strategy.value,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_attention_mask=return_attention_mask,
+        )
+
+        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
+
+        return batch_outputs
+
+    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    def prepare_for_model(
+        self,
+        ids: list[int],
+        pair_ids: None = None,
+        add_special_tokens: bool = True,
+        padding: bool | str | PaddingStrategy = False,
+        truncation: bool | str | TruncationStrategy | None = None,
+        max_length: int | None = None,
+        stride: int = 0,
+        pad_to_multiple_of: int | None = None,
+        padding_side: str | None = None,
+        return_tensors: str | TensorType | None = None,
+        return_attention_mask: bool | None = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        prepend_batch_axis: bool = False,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Prepares a sequence of input id so that it can be used by the model. It
+        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
+        manages a moving window (with user defined stride) for overflowing tokens.
+
+        Args:
+            ids (`list[int]`):
+                Tokenized input ids of the first sequence.
+            pair_ids (`None`, *optional*):
+                Not supported by `MistralCommonBackend`. Kept to match the interface of `PreTrainedTokenizerBase`.
+        """
+        if pair_ids is not None:
+            raise ValueError(
+                "`pair_ids` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
+            )
+        if kwargs:
+            raise ValueError(
+                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.prepare_for_model`."
+            )
+
+        padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
             padding=padding,
             truncation=truncation,
             max_length=max_length,
             pad_to_multiple_of=pad_to_multiple_of,
             verbose=verbose,
-            **kwargs,
         )
 
-
-        if (
-            return_overflowing_tokens
-            and truncation_strategy == TruncationStrategy.LONGEST_FIRST
-            and pair_ids is not None
-        ):
-            raise ValueError(
-                "Not possible to return overflowing tokens for pair of sequences with the "
-                "`longest_first`. Please select another truncation strategy than `longest_first`, "
-                "for instance `only_second` or `only_first`."
-            )
+        len_ids = len(ids)
 
-        #
-        if return_token_type_ids is None:
-            return_token_type_ids = "token_type_ids" in self.model_input_names
+        # Load from model defaults
         if return_attention_mask is None:
             return_attention_mask = "attention_mask" in self.model_input_names
 
-
-        num_special = self.num_special_tokens_to_add(pair=False) if add_special_tokens else 0
-        total_len = len(ids) + len(pair_ids or []) + num_special
+        encoded_inputs = {}
 
+        # Truncation: Handle max sequence length
         overflowing_tokens = []
-        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and
+        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and len_ids > max_length:
             ids, _, overflowing_tokens = self.truncate_sequences(
                 ids,
-
-                num_tokens_to_remove=total_len - max_length,
+                num_tokens_to_remove=len_ids - max_length,
                 truncation_strategy=truncation_strategy,
                 stride=stride,
             )
 
-
-        if add_special_tokens:
-            sequence = self.build_inputs_with_special_tokens(ids, None)
-            token_type_ids = self.create_token_type_ids_from_sequences(ids, None)
-        else:
-            sequence = ids
-            token_type_ids = [0] * len(sequence)
-
-        # Build output
-        encoded_inputs = {"input_ids": sequence}
-        if return_token_type_ids:
-            encoded_inputs["token_type_ids"] = token_type_ids
-        if return_special_tokens_mask:
-            encoded_inputs["special_tokens_mask"] = (
-                self.get_special_tokens_mask(ids, None) if add_special_tokens else [0] * len(sequence)
-            )
-        if return_overflowing_tokens and not return_tensors and overflowing_tokens:
+        if return_overflowing_tokens:
             encoded_inputs["overflowing_tokens"] = overflowing_tokens
-            encoded_inputs["num_truncated_tokens"] =
+            encoded_inputs["num_truncated_tokens"] = len_ids - max_length
 
-        #
-        self.
+        # Build output dictionary
+        encoded_inputs[self.model_input_names[0]] = ids
+        if return_special_tokens_mask:
+            if add_special_tokens:
+                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, None)
+            else:
+                encoded_inputs["special_tokens_mask"] = [0] * len(ids)
 
-        #
+        # Padding
         if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
             encoded_inputs = self.pad(
                 encoded_inputs,
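
A standalone sketch of the new `get_special_tokens_mask` logic above: 1 where the id is in the set of special-token ids, 0 elsewhere. The id set and sample sequence are illustrative.

```python
special_ids = {1, 2}          # stand-in for self._all_special_tokens_ids
token_ids_0 = [1, 15, 27, 2]

mask = [1 if token in special_ids else 0 for token in token_ids_0]
assert mask == [1, 0, 0, 1]
```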
@@ -972,9 +1017,362 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
         if return_length:
             encoded_inputs["length"] = len(encoded_inputs["input_ids"])
 
-
+        batch_outputs = BatchEncoding(
+            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
+        )
+
+        return batch_outputs
 
-    def
+    def _get_padding_truncation_strategies(
+        self,
+        padding: str | PaddingStrategy | bool = False,
+        truncation: str | TruncationStrategy | bool | None = None,
+        max_length: int | None = None,
+        pad_to_multiple_of: int | None = None,
+        verbose: bool = True,
+        **kwargs,
+    ):
+        """
+        Find the correct padding/truncation strategy.
+        """
+
+        # Backward compatibility for previous behavior, maybe we should deprecate it:
+        # If you only set max_length, it activates truncation for max_length
+        if max_length is not None and padding is False and truncation is None:
+            if verbose:
+                if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
+                    logger.warning(
+                        "Truncation was not explicitly activated but `max_length` is provided a specific value, please"
+                        " use `truncation=True` to explicitly truncate examples to max length. Defaulting to"
+                        " 'longest_first' truncation strategy."
+                    )
+                self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
+            truncation = "longest_first"
+
+        # Get padding strategy
+        if padding is not False:
+            if padding is True:
+                if verbose:
+                    if max_length is not None and (
+                        truncation is None or truncation is False or truncation == "do_not_truncate"
+                    ):
+                        warnings.warn(
+                            "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
+                            "To pad to max length, use `padding='max_length'`."
+                        )
+                padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch
+            elif not isinstance(padding, PaddingStrategy):
+                padding_strategy = PaddingStrategy(padding)
+            elif isinstance(padding, PaddingStrategy):
+                padding_strategy = padding
+        else:
+            padding_strategy = PaddingStrategy.DO_NOT_PAD
+
+        # Get truncation strategy
+        if truncation is not False and truncation is not None:
+            if truncation is True:
+                truncation_strategy = (
+                    TruncationStrategy.LONGEST_FIRST
+                )  # Default to truncate the longest sequences in pairs of inputs
+            elif not isinstance(truncation, TruncationStrategy):
+                truncation_strategy = TruncationStrategy(truncation)
+            elif isinstance(truncation, TruncationStrategy):
+                truncation_strategy = truncation
+            if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND]:
+                raise ValueError(
+                    "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonBackend`."
+                )
+        else:
+            truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
+
+        # Set max length if needed
+        if max_length is None:
+            if padding_strategy == PaddingStrategy.MAX_LENGTH:
+                if self.model_max_length > LARGE_INTEGER:
+                    if verbose:
+                        if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
+                            logger.warning(
+                                "Asking to pad to max_length but no maximum length is provided and the model has no"
+                                " predefined maximum length. Default to no padding."
+                            )
+                        self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
+                    padding_strategy = PaddingStrategy.DO_NOT_PAD
+                else:
+                    max_length = self.model_max_length
+
+            if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
+                if self.model_max_length > LARGE_INTEGER:
+                    if verbose:
+                        if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
+                            logger.warning(
+                                "Asking to truncate to max_length but no maximum length is provided and the model has"
+                                " no predefined maximum length. Default to no truncation."
+                            )
+                        self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
+                    truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
+                else:
+                    max_length = self.model_max_length
+
+        # Test if we have a padding token
+        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.pad_token is None or self.pad_token_id < 0):
+            raise ValueError(
+                "Asking to pad but the tokenizer does not have a padding token. "
+                "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
+                "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
+            )
+
+        # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
+        if (
+            truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
+            and padding_strategy != PaddingStrategy.DO_NOT_PAD
+            and pad_to_multiple_of is not None
+            and max_length is not None
+            and (max_length % pad_to_multiple_of != 0)
+        ):
+            raise ValueError(
+                "Truncation and padding are both activated but "
+                f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
+            )
+
+        return padding_strategy, truncation_strategy, max_length, kwargs
+
+    def _pad(
+        self,
+        encoded_inputs: dict[str, EncodedInput] | BatchEncoding,
+        max_length: int | None = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: int | None = None,
+        padding_side: str | None = None,
+        return_attention_mask: bool | None = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`).
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Will truncate by taking into account the special tokens.
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+                - PaddingStrategy.DO_NOT_PAD: Do not pad
+                The tokenizer padding sides are defined in `padding_side` argument:
+
+                - 'left': pads on the left of the sequences
+                - 'right': pads on the right of the sequences
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
+            return_attention_mask:
+                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+        """
+        # Load from model defaults
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+        # Initialize attention mask if not present.
+        if return_attention_mask and "attention_mask" not in encoded_inputs:
+            encoded_inputs["attention_mask"] = [1] * len(required_input)
+
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+            padding_side = padding_side if padding_side is not None else self.padding_side
+
+            if padding_side == "right":
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
+                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
+            elif padding_side == "left":
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+            else:
+                raise ValueError(f"Invalid padding strategy:{padding_side}")
+
+        return encoded_inputs
+
+    def pad(
+        self,
+        encoded_inputs: BatchEncoding
+        | list[BatchEncoding]
+        | dict[str, EncodedInput]
+        | dict[str, list[EncodedInput]]
+        | list[dict[str, EncodedInput]],
+        padding: bool | str | PaddingStrategy = True,
+        max_length: int | None = None,
+        pad_to_multiple_of: int | None = None,
+        padding_side: str | None = None,
+        return_attention_mask: bool | None = None,
+        return_tensors: str | TensorType | None = None,
+        verbose: bool = True,
+    ) -> BatchEncoding:
+        """
+        Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
+        in the batch.
+
+        Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,
+        `self.pad_token_id`).
+        <Tip>
+
+        If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors, the
+        result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
+        PyTorch tensors, you will lose the specific device of your tensors however.
+
+        </Tip>
+
+        Args:
+            encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, list[int]]`, `Dict[str, list[list[int]]` or `List[Dict[str, list[int]]]`):
+                Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, list[int]]`) or a batch of
+                tokenized inputs (list of [`BatchEncoding`], *Dict[str, list[list[int]]]* or *List[Dict[str,
+                list[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
+                collate function.
+
+                Instead of `list[int]` you can have tensors (numpy arrays, PyTorch tensors), see
+                the note above for the return type.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+
+                - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value.
+
+                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
+            return_attention_mask (`bool`, *optional*):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are attention masks?](../glossary#attention-mask)
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            verbose (`bool`, *optional*, defaults to `True`):
+                Whether or not to print more information and warnings.
+        """
+        # If we have a list of dicts, let's convert it in a dict of lists
+        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
+        if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
+            # Call .keys() explicitly for compatibility with TensorDict and other Mapping subclasses
+            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
+
+        # The model's main input name, usually `input_ids`, has been passed for padding
+        if self.model_input_names[0] not in encoded_inputs:
+            raise ValueError(
+                "You should supply an encoding or a list of encodings to this method "
+                f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
+            )
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+
+        if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):
+            if return_attention_mask:
+                encoded_inputs["attention_mask"] = []
+            return encoded_inputs
+
+        # If we have PyTorch/NumPy tensors/arrays as inputs, we cast them as python objects
+        # and rebuild them afterwards if no return_tensors is specified
+        # Note that we lose the specific device the tensor may be on for PyTorch
+
+        first_element = required_input[0]
+        if isinstance(first_element, (list, tuple)):
+            # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
+            for item in required_input:
+                if len(item) != 0:
+                    first_element = item[0]
+                    break
+        # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
+        if not isinstance(first_element, (int, list, tuple)):
+            if is_torch_tensor(first_element):
+                return_tensors = "pt" if return_tensors is None else return_tensors
+            elif isinstance(first_element, np.ndarray):
+                return_tensors = "np" if return_tensors is None else return_tensors
+            else:
+                raise ValueError(
+                    f"type of {first_element} unknown: {type(first_element)}. "
+                    "Should be one of a python, numpy, or pytorch object."
+                )
+
+            for key, value in encoded_inputs.items():
+                encoded_inputs[key] = to_py_obj(value)
+
+        # Convert padding_strategy in PaddingStrategy
+        padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
+            padding=padding, max_length=max_length, verbose=verbose
+        )
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+        if required_input and not isinstance(required_input[0], (list, tuple)):
+            encoded_inputs = self._pad(
+                encoded_inputs,
+                max_length=max_length,
+                padding_strategy=padding_strategy,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_attention_mask=return_attention_mask,
+            )
+            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
+
+        batch_size = len(required_input)
+        assert all(len(v) == batch_size for v in encoded_inputs.values()), (
+            "Some items in the output dictionary have a different batch size than others."
+        )
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = max(len(inputs) for inputs in required_input)
+            padding_strategy = PaddingStrategy.MAX_LENGTH
+
+        batch_outputs = {}
+        for i in range(batch_size):
+            inputs = {k: v[i] for k, v in encoded_inputs.items()}
+            outputs = self._pad(
+                inputs,
+                max_length=max_length,
+                padding_strategy=padding_strategy,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_attention_mask=return_attention_mask,
+            )
+
+            for key, value in outputs.items():
+                if key not in batch_outputs:
+                    batch_outputs[key] = []
+                batch_outputs[key].append(value)
+
+        return BatchEncoding(batch_outputs, tensor_type=return_tensors)
+
+    def truncate_sequences(
         self,
         ids: list[int],
         pair_ids: None = None,
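
A standalone sketch of the right-padding arithmetic used by the new `_pad` above for a single sequence; the sample values and `pad_token_id` are illustrative.

```python
input_ids = [5, 6, 7]
attention_mask = [1] * len(input_ids)
max_length, pad_token_id = 6, 0

difference = max_length - len(input_ids)
input_ids = input_ids + [pad_token_id] * difference          # pad on the right
attention_mask = attention_mask + [0] * difference            # mask marks real tokens only

assert input_ids == [5, 6, 7, 0, 0, 0]
assert attention_mask == [1, 1, 1, 0, 0, 0]
```

As the `pad` docstring notes, the batched form of this logic is what makes the method usable as a DataLoader collate function: a list of per-example dicts is first transposed into a dict of lists, then each example is padded to the common length.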
@@ -1009,36 +1407,47 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
             `Tuple[list[int], None, list[int]]`: The truncated `ids` and the list of
             overflowing tokens. `None` is returned to match Transformers signature.
         """
-
+        if kwargs:
+            raise ValueError(
+                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.truncate_sequences`."
+            )
         if pair_ids:
             raise ValueError("`pair_ids` is not supported by `MistralCommonBackend.truncate_sequences`.")
 
+        if num_tokens_to_remove <= 0:
+            return (ids, None, [])
+
         if not isinstance(truncation_strategy, TruncationStrategy):
             truncation_strategy = TruncationStrategy(truncation_strategy)
 
-        if truncation_strategy in [
-
-
-
-            raise ValueError(f"{truncation_strategy=} is not supported by `MistralCommonBackend`.")
-
-        if num_tokens_to_remove <= 0:
-            return ids, None, []
+        if truncation_strategy in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND]:
+            raise ValueError(
+                f"Only {TruncationStrategy.LONGEST_FIRST} and {TruncationStrategy.DO_NOT_TRUNCATE} are supported."
+            )
 
         overflowing_tokens = []
-
         if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
-
-
-
-
+            if len(ids) > num_tokens_to_remove:
+                window_len = min(len(ids), stride + num_tokens_to_remove)
+                if self.truncation_side == "left":
+                    overflowing_tokens = ids[:window_len]
+                    ids = ids[num_tokens_to_remove:]
+                elif self.truncation_side == "right":
+                    overflowing_tokens = ids[-window_len:]
+                    ids = ids[:-num_tokens_to_remove]
+                else:
+                    raise ValueError(f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'.")
+
         else:
-
-
+            error_msg = (
+                f"We need to remove {num_tokens_to_remove} to truncate the input "
+                f"but the first sequence has a length {len(ids)}. "
+            )
+            logger.error(error_msg)
 
-        return ids, None, overflowing_tokens
+        return (ids, None, overflowing_tokens)
 
-    def apply_chat_template(
+    def apply_chat_template(
         self,
         conversation: list[dict[str, str]] | list[list[dict[str, str]]],
         tools: list[dict | Callable] | None = None,
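
The rewritten `truncate_sequences` only keeps `LONGEST_FIRST` (and `DO_NOT_TRUNCATE`), and the overflow it returns is a window of `stride + num_tokens_to_remove` tokens taken from the truncated side. A standalone sketch of the same window arithmetic, assuming plain `list[int]` inputs rather than the backend method:

def truncate(ids: list[int], num_tokens_to_remove: int, stride: int = 0, side: str = "right"):
    # Same arithmetic as the diff: overflow = removed tokens plus a stride-sized overlap.
    if num_tokens_to_remove <= 0:
        return ids, []
    window_len = min(len(ids), stride + num_tokens_to_remove)
    if side == "left":
        return ids[num_tokens_to_remove:], ids[:window_len]
    return ids[:-num_tokens_to_remove], ids[-window_len:]

print(truncate(list(range(10)), num_tokens_to_remove=3, stride=2))
# ([0, 1, 2, 3, 4, 5, 6], [5, 6, 7, 8, 9])
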
@@ -1066,8 +1475,8 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
                 [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
                 for more information.
             add_generation_prompt (`bool`, *optional*):
-                This argument is a no-op for `MistralCommonBackend`. However
-
+                This argument is a no-op for `MistralCommonBackend`. However it cannot be used at the same time as `continue_final_message` to keep the API consistent and
+                if any conversation ends with an assistant message, it will raise an error. In such case, use `continue_final_message` instead.
             continue_final_message (bool, *optional*):
                 If this is set, the chat will be formatted so that the final
                 message in the chat is open-ended, without any EOS tokens. The model will continue this message
@@ -1102,7 +1511,8 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
                 Will raise an error if used.
 
         Returns:
-            `Union[str, list[int], list[str], list[list[int]], BatchEncoding]`:
+            `Union[str, list[int], list[str], list[list[int]], BatchEncoding]`: A list of token ids representing the tokenized chat so far, including control
+            tokens. This output is ready to pass to the model, either directly or via methods like `generate()`.
         """
         if kwargs:
             raise ValueError(
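
Per the expanded return description, `apply_chat_template` hands back token ids that already include control tokens and can go straight into generation. A hedged usage sketch; the repository id is a placeholder and assumes a checkpoint whose tokenizer resolves to this backend:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("mistralai/Some-Instruct-Model")  # placeholder repo id

conversation = [{"role": "user", "content": "Write a haiku about tokenizers."}]
input_ids = tok.apply_chat_template(conversation)  # list[int] including control tokens
# Ready to feed to the model, e.g. model.generate(torch.tensor([input_ids]))
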
@@ -1249,83 +1659,6 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
         )
         return outputs
 
-    def build_inputs_with_special_tokens(self, token_ids_0: list[int], token_ids_1: None = None) -> list[int]:
-        """
-        Build model inputs from a sequence by adding special tokens.
-
-        This method dynamically builds inputs based on the tokenizer's `mode`:
-        - `"test"`: seq0 [EOS]
-        - `"finetuning"`: [BOS] seq0
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`None`, *optional*): None, kept to match Transformers' signature.
-
-        Returns:
-            `list[int]`: List of input IDs with the appropriate special tokens.
-        """
-        if token_ids_1 is not None:
-            raise ValueError(
-                "`MistralCommonBackend` does not implement `token_ids_1 != None` for `build_inputs_with_special_tokens`."
-            )
-
-        if self.mode == ValidationMode.test:
-            # [BOS] seq0
-            return [self.bos_token_id] + token_ids_0
-
-        else:
-            # [BOS] seq0 [EOS]
-            return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
-
-    def create_token_type_ids_from_sequences(self, token_ids_0: list[int], token_ids_1: None = None) -> list[int]:
-        """
-        Create a mask of zeroes from the token ids with special tokens added.
-
-        Kept to match Transformers' implementation.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`None`, *optional*): None, kept to match Transformers' signature.
-
-
-        Returns:
-            `list[int]`: Token type IDs according to the configured pattern.
-        """
-        if token_ids_1 is not None:
-            raise ValueError(
-                "`MistralCommonBackend` does not implement `token_ids_1 != None` for `create_token_type_ids_from_sequences`."
-            )
-
-        sequence = self.build_inputs_with_special_tokens(token_ids_0)
-
-        return [0] * len(sequence)
-
-    def num_special_tokens_to_add(self, pair: Literal[False] = False) -> int:
-        """
-        Returns the number of added tokens when encoding a sequence with special tokens.
-
-        <Tip>
-
-        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
-        this inside your training loop.
-
-        </Tip>
-
-        Args:
-            pair (`Literal[False]`, *optional*): False, kept to match Transformer's signature.
-
-        Returns:
-            `int`: Number of special tokens added to sequences.
-        """
-        if pair:
-            raise ValueError(
-                "`MistralCommonBackend` does not implement `pair = True` for `num_special_tokens_to_add`."
-            )
-
-        return len(self.build_inputs_with_special_tokens([], None))
-
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def __call__(
         self,
@@ -1346,8 +1679,6 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
         return_special_tokens_mask: bool = False,
         return_length: bool = False,
         verbose: bool = True,
-        return_offsets_mapping: Literal[False] = False,
-        split_special_tokens: Literal[False] = False,
         **kwargs,
     ) -> BatchEncoding:
         """
@@ -1365,16 +1696,6 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
             text_pair_target (`None`, *optional*):
                 Not supported by `MistralCommonBackend`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
         """
-        if return_offsets_mapping or split_special_tokens:
-            raise ValueError(
-                "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
-            )
-
-        if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND, "only_first", "only_second"]:
-            raise ValueError(
-                "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonBackend`."
-            )
-
         if kwargs:
             raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.__call__`.")
 
@@ -1383,31 +1704,84 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
                 "`text_pair`, `text_target` and `text_pair_target` are not supported by `MistralCommonBackend`."
             )
 
-
-
-
-
-
+        def _is_valid_text_input(t):
+            if isinstance(t, str):
+                # Strings are fine
+                return True
+            elif isinstance(t, (list, tuple)):
+                # List are fine as long as they are...
+                if len(t) == 0:
+                    # ... empty
+                    return True
+                elif isinstance(t[0], (str, int)):
+                    # ... list of strings or int
+                    return True
+                elif isinstance(t[0], (list, tuple)):
+                    # ... list with an empty list or with a list of strings or with a list of ints
+                    return len(t[0]) == 0 or isinstance(t[0][0], (str, int))
+                else:
+                    return False
+            else:
+                return False
+
+        if not _is_valid_text_input(text):
+            raise ValueError(
+                "text input must be of type `str` (single example), `list[str]` (batch or single encoded example) "
+                "or `list[list[int]]` (batch of encoded examples)."
+            )
+
+        is_batched = isinstance(text, (list, tuple)) and isinstance(text[0], (str, list, tuple))
+
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
             padding=padding,
             truncation=truncation,
             max_length=max_length,
-            stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_length=return_length,
             verbose=verbose,
+            **kwargs,
         )
 
+        if is_batched:
+            return self._batch_encode_plus(
+                batch_text=text,
+                add_special_tokens=add_special_tokens,
+                padding_strategy=padding_strategy,
+                truncation_strategy=truncation_strategy,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_length=return_length,
+                verbose=verbose,
+            )
+        else:
+            return self._encode_plus(
+                text=text,
+                add_special_tokens=add_special_tokens,
+                padding_strategy=padding_strategy,
+                truncation_strategy=truncation_strategy,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_length=return_length,
+                verbose=verbose,
+            )
+
     @classmethod
     def from_pretrained(
         cls,
         pretrained_model_name_or_path: str | os.PathLike,
         *init_inputs,
-        mode: str
+        mode: Union[str, ValidationMode] = ValidationMode.test,
         cache_dir: str | os.PathLike | None = None,
         force_download: bool = False,
         local_files_only: bool = False,
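
`__call__` now validates the shape of `text` with a nested `_is_valid_text_input` helper and dispatches to `_batch_encode_plus` or `_encode_plus` based on the same check. A standalone copy of that validation logic with a few example inputs:

def is_valid_text_input(t) -> bool:
    # Same shape rules as the helper in the diff: str, list[str|int], or list[list[str|int]].
    if isinstance(t, str):
        return True
    if isinstance(t, (list, tuple)):
        if len(t) == 0:
            return True
        if isinstance(t[0], (str, int)):
            return True
        if isinstance(t[0], (list, tuple)):
            return len(t[0]) == 0 or isinstance(t[0][0], (str, int))
    return False

assert is_valid_text_input("hello")
assert is_valid_text_input([1, 2, 3])             # single encoded example
assert is_valid_text_input([[1, 2], [3]])         # batch of encoded examples
assert not is_valid_text_input({"input_ids": []})
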
@@ -1434,9 +1808,9 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
                 `./my_model_directory/`.
             mode (`Union[str, ValidationMode]`, *optional*, defaults to `ValidationMode.test`):
                 Validation mode for the `MistralTokenizer` tokenizer. Possible values are:
-                - `"finetuning"` or `ValidationMode.finetuning`: The
+                - `"finetuning"` or `ValidationMode.finetuning`: The finetuning mode.
                 - `"test"` or `ValidationMode.test`: The test mode.
-                It changes how the tokenizer validates the input and
+                It changes how the tokenizer validates the input and prepare the request to the model.
             cache_dir (`str` or `os.PathLike`, *optional*):
                 Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the
                 standard cache should not be used.
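
The `mode` parameter documented above defaults to `ValidationMode.test`; passing `"finetuning"` (or the enum member) switches the validation behaviour. A minimal usage sketch; the import location and the local directory are assumptions:

# Class name is taken from this diff; its import path is an assumption — adjust to your install.
from transformers import MistralCommonBackend

tok = MistralCommonBackend.from_pretrained(
    "./my_model_directory",   # local dir holding tekken.json or a *.model.* file
    mode="finetuning",        # or ValidationMode.finetuning; the default is "test"
)
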
@@ -1463,11 +1837,11 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
                 Default value is picked from the class attribute of the same name.
             truncation_side (`str`, *optional*, defaults to `"right"`):
                 The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
-            model_input_names (`List[
+            model_input_names (`List[string]`, *optional*):
                 The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
                 `"attention_mask"`). Default value is picked from the class attribute of the same name.
             clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
-                Whether or not the model should
+                Whether or not the model should cleanup the spaces that were added when splitting the input text during the
                 tokenization process.
             kwargs (additional keyword arguments, *optional*):
                 Not supported by `MistralCommonBackend.from_pretrained`.
@@ -1477,13 +1851,10 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
             raise ValueError("`init_inputs` are not supported by `MistralCommonBackend.from_pretrained`.")
 
         # Handle kwargs and AutoTokenizer/AutoProcessor case
-
-            {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "
-        )
-
-            raise ValueError(
-                f"Some kwargs in {list(kwargs.keys())} are not supported by `MistralCommonBackend.from_pretrained`."
-            )
+        if kwargs and not set(kwargs.keys()).issubset(
+            {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "_from_auto"}
+        ):
+            raise ValueError(f"Some kwargs in {kwargs} are not supported by `MistralCommonBackend.from_pretrained`.")
 
         mode = cls._get_validation_mode(mode)
 
@@ -1497,8 +1868,35 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
                 local_files_only=local_files_only,
             )
         else:
-
-
+            valid_tokenizer_files = []
+            tokenizer_file: str
+
+            instruct_versions = list(TokenizerVersion.__members__)
+            mm_versions = list(MultiModalVersion.__members__) + [""]  # allow no mm version
+            sentencepiece_suffixes = [f".model.{v}{m}" for v in instruct_versions for m in mm_versions] + [".model"]
+
+            for path in os.listdir(pretrained_model_name_or_path):
+                pathlib_repo_file = Path(path)
+                file_name = pathlib_repo_file.name
+                suffix = "".join(pathlib_repo_file.suffixes)
+                if file_name == "tekken.json" or suffix in sentencepiece_suffixes:
+                    valid_tokenizer_files.append(file_name)
+
+            if len(valid_tokenizer_files) == 0:
+                raise ValueError(f"No tokenizer file found in directory: {pretrained_model_name_or_path}")
+            # If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
+            if len(valid_tokenizer_files) > 1:
+                if "tekken.json" in valid_tokenizer_files:
+                    tokenizer_file = "tekken.json"
+                else:
+                    tokenizer_file = max(valid_tokenizer_files)
+                logger.warning(
+                    f"Multiple tokenizer files found in directory: {pretrained_model_name_or_path}. Using {tokenizer_file}."
+                )
+            else:
+                tokenizer_file = valid_tokenizer_files[0]
+
+            tokenizer_path = os.path.join(pretrained_model_name_or_path, tokenizer_file)
 
         return cls(
             tokenizer_path=tokenizer_path,
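
The new local-directory branch of `from_pretrained` collects candidate tokenizer files and prefers `tekken.json`, falling back to the lexicographically highest versioned file. A standalone sketch of that selection rule (file names are illustrative):

def pick_tokenizer_file(candidates: list[str]) -> str:
    # Same preference order as the diff: tekken.json first, then the highest versioned file.
    if not candidates:
        raise ValueError("No tokenizer file found.")
    if "tekken.json" in candidates:
        return "tekken.json"
    return max(candidates)

print(pick_tokenizer_file(["tokenizer.model.v3", "tokenizer.model.v7m1"]))  # tokenizer.model.v7m1
print(pick_tokenizer_file(["tekken.json", "tokenizer.model.v3"]))           # tekken.json
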
@@ -1510,7 +1908,7 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
         )
 
-    def save_pretrained(
+    def save_pretrained(
         self,
         save_directory: str | os.PathLike | Path,
         push_to_hub: bool = False,
@@ -1572,7 +1970,7 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
         return (str(save_directory / self._tokenizer_path.name),)
 
     @staticmethod
-    def _get_validation_mode(mode: str
+    def _get_validation_mode(mode: Union[str, ValidationMode]) -> ValidationMode:
         """Get the validation mode from a string or a ValidationMode."""
         _invalid_mode_msg = (
             f"Invalid `mistral-common` tokenizer mode: {mode}. Possible values are 'finetuning' or 'test'."
@@ -1588,66 +1986,3 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
         if mode not in [ValidationMode.finetuning, ValidationMode.test]:
             raise ValueError(_invalid_mode_msg)
         return mode
-
-    def add_special_tokens(
-        self,
-        special_tokens_dict: dict[str, str | AddedToken | Sequence[str | AddedToken]],
-        replace_extra_special_tokens: bool = True,
-    ):
-        r"""`MistralCommonBackend` does not implement `add_special_tokens` by design.
-
-        If you would like this behaviour to be implemented, please open an issue in the `Transformers` or `mistral-common` repositories to request it.
-        """
-
-        raise NotImplementedError("`MistralCommonBackend` does not implement `add_special_tokens`.")
-
-    def add_tokens( # type: ignore[override]
-        self,
-        special_tokens_dict: dict[str, str | AddedToken | Sequence[str | AddedToken]],
-        replace_extra_special_tokens: bool = True,
-    ):
-        """
-        `MistralCommonBackend` does not implement `add_special_tokens` by design.
-
-        If you would like this behaviour to be implemented, please open an issue in the `Transformers` or `mistral-common` repositories to request it.
-        """
-
-        raise NotImplementedError("`MistralCommonBackend` does not implement `add_tokens`.")
-
-    def convert_added_tokens(cls, obj: AddedToken | Any, save: bool = False, add_type_field: bool = True): # type: ignore[override]
-        """
-        `MistralCommonBackend` does not implement `convert_added_tokens` by design.
-
-        If you would like this behaviour to be implemented, please open an issue in the `Transformers` or `mistral-common` repositories to request it.
-        """
-
-        raise NotImplementedError("`MistralCommonBackend` does not implement `convert_added_tokens`.")
-
-    def get_chat_template(self, chat_template: str | None = None, tools: list[dict] | None = None) -> str:
-        """`MistralCommonBackend` does not implement `get_chat_template` by design as `mistral-common` does not use chat templates."""
-
-        raise NotImplementedError("`MistralCommonBackend` does not implement `get_chat_template`.")
-
-    def save_chat_templates(
-        self,
-        save_directory: str | os.PathLike,
-        tokenizer_config: dict,
-        filename_prefix: str | None,
-        save_jinja_files: bool,
-    ):
-        """`MistralCommonBackend` does not implement `save_chat_templates` by design as `mistral-common` does not use chat templates."""
-
-        raise NotImplementedError("`MistralCommonBackend` does not implement `save_chat_templates`.")
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str, ...]:
-        """
-        `MistralCommonBackend` does not implement `save_vocabulary` by design.
-
-        This is because `mistral-common` is configured by one tokenizer file. If you'd like to save the vocabulary, please consider using the `save_pretrained` method instead.
-        """
-
-        raise NotImplementedError("`MistralCommonBackend` does not implement `save_vocabulary`.")
-
-
-# Backward compatibility alias for codebases still importing the legacy name.
-MistralCommonTokenizer = MistralCommonBackend