transformers-5.0.0rc2-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
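Since the comparison targets the 5.1.0 wheel, a quick way to confirm which of the two versions is active in a given environment is to read the package's version attribute. The snippet below is a minimal sketch, assuming the new wheel has been installed in the usual way (e.g. `pip install transformers==5.1.0`); it is not part of the diff itself.

```python
# Minimal sketch: confirm that the interpreter resolves the upgraded package.
import transformers

print(transformers.__version__)  # expected: "5.1.0" after upgrading from 5.0.0rc2
```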
- transformers/__init__.py +11 -37
- transformers/activations.py +2 -2
- transformers/audio_utils.py +32 -32
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +26 -126
- transformers/cli/chat.py +3 -3
- transformers/cli/serve.py +13 -10
- transformers/cli/transformers.py +2 -1
- transformers/configuration_utils.py +22 -92
- transformers/conversion_mapping.py +150 -26
- transformers/convert_slow_tokenizer.py +9 -12
- transformers/core_model_loading.py +217 -129
- transformers/data/processors/glue.py +0 -1
- transformers/data/processors/utils.py +0 -1
- transformers/data/processors/xnli.py +0 -1
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +10 -11
- transformers/distributed/configuration_utils.py +1 -2
- transformers/dynamic_module_utils.py +23 -23
- transformers/feature_extraction_sequence_utils.py +19 -23
- transformers/feature_extraction_utils.py +14 -14
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +2 -4
- transformers/generation/configuration_utils.py +54 -39
- transformers/generation/continuous_batching/__init__.py +0 -1
- transformers/generation/continuous_batching/cache.py +74 -44
- transformers/generation/continuous_batching/cache_manager.py +28 -28
- transformers/generation/continuous_batching/continuous_api.py +133 -414
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +77 -19
- transformers/generation/continuous_batching/scheduler.py +154 -104
- transformers/generation/logits_process.py +10 -133
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/streamers.py +0 -1
- transformers/generation/utils.py +91 -121
- transformers/generation/watermarking.py +2 -3
- transformers/hf_argparser.py +9 -13
- transformers/hyperparameter_search.py +1 -2
- transformers/image_processing_base.py +9 -9
- transformers/image_processing_utils.py +11 -15
- transformers/image_processing_utils_fast.py +70 -71
- transformers/image_transforms.py +73 -42
- transformers/image_utils.py +30 -37
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/awq.py +1 -3
- transformers/integrations/deepspeed.py +146 -4
- transformers/integrations/eetq.py +0 -1
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/fbgemm_fp8.py +1 -2
- transformers/integrations/finegrained_fp8.py +149 -13
- transformers/integrations/flash_attention.py +3 -8
- transformers/integrations/flex_attention.py +1 -1
- transformers/integrations/fp_quant.py +4 -6
- transformers/integrations/ggml.py +0 -1
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/integration_utils.py +2 -3
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +52 -40
- transformers/integrations/peft.py +488 -176
- transformers/integrations/quark.py +2 -4
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/integrations/torchao.py +4 -6
- transformers/loss/loss_lw_detr.py +356 -0
- transformers/loss/loss_utils.py +2 -0
- transformers/masking_utils.py +199 -59
- transformers/model_debugging_utils.py +4 -5
- transformers/modelcard.py +14 -192
- transformers/modeling_attn_mask_utils.py +19 -19
- transformers/modeling_flash_attention_utils.py +28 -29
- transformers/modeling_gguf_pytorch_utils.py +5 -5
- transformers/modeling_layers.py +21 -22
- transformers/modeling_outputs.py +242 -253
- transformers/modeling_rope_utils.py +32 -32
- transformers/modeling_utils.py +416 -438
- transformers/models/__init__.py +10 -0
- transformers/models/afmoe/configuration_afmoe.py +40 -33
- transformers/models/afmoe/modeling_afmoe.py +38 -41
- transformers/models/afmoe/modular_afmoe.py +23 -25
- transformers/models/aimv2/configuration_aimv2.py +2 -10
- transformers/models/aimv2/modeling_aimv2.py +46 -45
- transformers/models/aimv2/modular_aimv2.py +13 -19
- transformers/models/albert/configuration_albert.py +8 -2
- transformers/models/albert/modeling_albert.py +70 -72
- transformers/models/albert/tokenization_albert.py +1 -4
- transformers/models/align/configuration_align.py +8 -6
- transformers/models/align/modeling_align.py +83 -86
- transformers/models/align/processing_align.py +2 -30
- transformers/models/altclip/configuration_altclip.py +4 -7
- transformers/models/altclip/modeling_altclip.py +106 -103
- transformers/models/altclip/processing_altclip.py +2 -15
- transformers/models/apertus/__init__.py +0 -1
- transformers/models/apertus/configuration_apertus.py +23 -28
- transformers/models/apertus/modeling_apertus.py +35 -38
- transformers/models/apertus/modular_apertus.py +36 -40
- transformers/models/arcee/configuration_arcee.py +25 -30
- transformers/models/arcee/modeling_arcee.py +35 -38
- transformers/models/arcee/modular_arcee.py +20 -23
- transformers/models/aria/configuration_aria.py +31 -44
- transformers/models/aria/image_processing_aria.py +25 -27
- transformers/models/aria/modeling_aria.py +102 -102
- transformers/models/aria/modular_aria.py +111 -124
- transformers/models/aria/processing_aria.py +28 -35
- transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +0 -1
- transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +3 -6
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +9 -11
- transformers/models/audioflamingo3/__init__.py +0 -1
- transformers/models/audioflamingo3/configuration_audioflamingo3.py +0 -1
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +60 -52
- transformers/models/audioflamingo3/modular_audioflamingo3.py +52 -43
- transformers/models/audioflamingo3/processing_audioflamingo3.py +6 -8
- transformers/models/auto/auto_factory.py +12 -11
- transformers/models/auto/configuration_auto.py +48 -5
- transformers/models/auto/feature_extraction_auto.py +5 -7
- transformers/models/auto/image_processing_auto.py +30 -39
- transformers/models/auto/modeling_auto.py +33 -199
- transformers/models/auto/processing_auto.py +11 -19
- transformers/models/auto/tokenization_auto.py +38 -37
- transformers/models/auto/video_processing_auto.py +7 -8
- transformers/models/autoformer/configuration_autoformer.py +4 -7
- transformers/models/autoformer/modeling_autoformer.py +100 -101
- transformers/models/aya_vision/configuration_aya_vision.py +4 -1
- transformers/models/aya_vision/modeling_aya_vision.py +64 -99
- transformers/models/aya_vision/modular_aya_vision.py +46 -74
- transformers/models/aya_vision/processing_aya_vision.py +25 -53
- transformers/models/bamba/configuration_bamba.py +46 -39
- transformers/models/bamba/modeling_bamba.py +83 -119
- transformers/models/bamba/modular_bamba.py +70 -109
- transformers/models/bark/configuration_bark.py +6 -8
- transformers/models/bark/generation_configuration_bark.py +3 -5
- transformers/models/bark/modeling_bark.py +64 -65
- transformers/models/bark/processing_bark.py +19 -41
- transformers/models/bart/configuration_bart.py +9 -5
- transformers/models/bart/modeling_bart.py +124 -129
- transformers/models/barthez/tokenization_barthez.py +1 -4
- transformers/models/bartpho/tokenization_bartpho.py +6 -7
- transformers/models/beit/configuration_beit.py +2 -15
- transformers/models/beit/image_processing_beit.py +53 -56
- transformers/models/beit/image_processing_beit_fast.py +11 -12
- transformers/models/beit/modeling_beit.py +65 -62
- transformers/models/bert/configuration_bert.py +12 -2
- transformers/models/bert/modeling_bert.py +117 -152
- transformers/models/bert/tokenization_bert.py +2 -4
- transformers/models/bert/tokenization_bert_legacy.py +3 -5
- transformers/models/bert_generation/configuration_bert_generation.py +17 -2
- transformers/models/bert_generation/modeling_bert_generation.py +53 -55
- transformers/models/bert_generation/tokenization_bert_generation.py +2 -3
- transformers/models/bert_japanese/tokenization_bert_japanese.py +5 -6
- transformers/models/bertweet/tokenization_bertweet.py +1 -3
- transformers/models/big_bird/configuration_big_bird.py +12 -9
- transformers/models/big_bird/modeling_big_bird.py +107 -124
- transformers/models/big_bird/tokenization_big_bird.py +1 -4
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -9
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +118 -118
- transformers/models/biogpt/configuration_biogpt.py +8 -2
- transformers/models/biogpt/modeling_biogpt.py +73 -79
- transformers/models/biogpt/modular_biogpt.py +60 -66
- transformers/models/biogpt/tokenization_biogpt.py +3 -5
- transformers/models/bit/configuration_bit.py +2 -5
- transformers/models/bit/image_processing_bit.py +21 -24
- transformers/models/bit/image_processing_bit_fast.py +0 -1
- transformers/models/bit/modeling_bit.py +15 -16
- transformers/models/bitnet/configuration_bitnet.py +23 -28
- transformers/models/bitnet/modeling_bitnet.py +34 -38
- transformers/models/bitnet/modular_bitnet.py +7 -10
- transformers/models/blenderbot/configuration_blenderbot.py +8 -5
- transformers/models/blenderbot/modeling_blenderbot.py +68 -99
- transformers/models/blenderbot/tokenization_blenderbot.py +0 -1
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -5
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +70 -72
- transformers/models/blenderbot_small/tokenization_blenderbot_small.py +1 -3
- transformers/models/blip/configuration_blip.py +9 -10
- transformers/models/blip/image_processing_blip.py +17 -20
- transformers/models/blip/image_processing_blip_fast.py +0 -1
- transformers/models/blip/modeling_blip.py +115 -108
- transformers/models/blip/modeling_blip_text.py +63 -65
- transformers/models/blip/processing_blip.py +5 -36
- transformers/models/blip_2/configuration_blip_2.py +2 -2
- transformers/models/blip_2/modeling_blip_2.py +145 -121
- transformers/models/blip_2/processing_blip_2.py +8 -38
- transformers/models/bloom/configuration_bloom.py +5 -2
- transformers/models/bloom/modeling_bloom.py +60 -60
- transformers/models/blt/configuration_blt.py +94 -86
- transformers/models/blt/modeling_blt.py +93 -90
- transformers/models/blt/modular_blt.py +127 -69
- transformers/models/bridgetower/configuration_bridgetower.py +7 -2
- transformers/models/bridgetower/image_processing_bridgetower.py +34 -35
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +13 -14
- transformers/models/bridgetower/modeling_bridgetower.py +136 -124
- transformers/models/bridgetower/processing_bridgetower.py +2 -16
- transformers/models/bros/configuration_bros.py +24 -18
- transformers/models/bros/modeling_bros.py +78 -80
- transformers/models/bros/processing_bros.py +2 -12
- transformers/models/byt5/tokenization_byt5.py +4 -6
- transformers/models/camembert/configuration_camembert.py +8 -2
- transformers/models/camembert/modeling_camembert.py +97 -99
- transformers/models/camembert/modular_camembert.py +51 -54
- transformers/models/camembert/tokenization_camembert.py +1 -4
- transformers/models/canine/configuration_canine.py +4 -2
- transformers/models/canine/modeling_canine.py +73 -75
- transformers/models/canine/tokenization_canine.py +0 -1
- transformers/models/chameleon/configuration_chameleon.py +29 -34
- transformers/models/chameleon/image_processing_chameleon.py +21 -24
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -6
- transformers/models/chameleon/modeling_chameleon.py +135 -92
- transformers/models/chameleon/processing_chameleon.py +16 -41
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -8
- transformers/models/chinese_clip/image_processing_chinese_clip.py +21 -24
- transformers/models/chinese_clip/image_processing_chinese_clip_fast.py +0 -1
- transformers/models/chinese_clip/modeling_chinese_clip.py +93 -95
- transformers/models/chinese_clip/processing_chinese_clip.py +2 -15
- transformers/models/clap/configuration_clap.py +4 -9
- transformers/models/clap/feature_extraction_clap.py +9 -10
- transformers/models/clap/modeling_clap.py +109 -111
- transformers/models/clap/processing_clap.py +2 -15
- transformers/models/clip/configuration_clip.py +4 -2
- transformers/models/clip/image_processing_clip.py +21 -24
- transformers/models/clip/image_processing_clip_fast.py +9 -1
- transformers/models/clip/modeling_clip.py +70 -68
- transformers/models/clip/processing_clip.py +2 -14
- transformers/models/clip/tokenization_clip.py +2 -5
- transformers/models/clipseg/configuration_clipseg.py +4 -2
- transformers/models/clipseg/modeling_clipseg.py +113 -112
- transformers/models/clipseg/processing_clipseg.py +19 -42
- transformers/models/clvp/configuration_clvp.py +15 -5
- transformers/models/clvp/feature_extraction_clvp.py +7 -10
- transformers/models/clvp/modeling_clvp.py +138 -145
- transformers/models/clvp/number_normalizer.py +1 -2
- transformers/models/clvp/processing_clvp.py +3 -20
- transformers/models/clvp/tokenization_clvp.py +0 -1
- transformers/models/code_llama/tokenization_code_llama.py +3 -6
- transformers/models/codegen/configuration_codegen.py +4 -4
- transformers/models/codegen/modeling_codegen.py +50 -49
- transformers/models/codegen/tokenization_codegen.py +5 -6
- transformers/models/cohere/configuration_cohere.py +25 -30
- transformers/models/cohere/modeling_cohere.py +39 -42
- transformers/models/cohere/modular_cohere.py +27 -31
- transformers/models/cohere/tokenization_cohere.py +5 -6
- transformers/models/cohere2/configuration_cohere2.py +27 -32
- transformers/models/cohere2/modeling_cohere2.py +38 -41
- transformers/models/cohere2/modular_cohere2.py +48 -52
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +9 -10
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +52 -55
- transformers/models/cohere2_vision/modular_cohere2_vision.py +41 -43
- transformers/models/cohere2_vision/processing_cohere2_vision.py +6 -36
- transformers/models/colpali/configuration_colpali.py +0 -1
- transformers/models/colpali/modeling_colpali.py +14 -16
- transformers/models/colpali/modular_colpali.py +11 -51
- transformers/models/colpali/processing_colpali.py +14 -52
- transformers/models/colqwen2/modeling_colqwen2.py +27 -28
- transformers/models/colqwen2/modular_colqwen2.py +36 -74
- transformers/models/colqwen2/processing_colqwen2.py +16 -52
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -47
- transformers/models/conditional_detr/image_processing_conditional_detr.py +67 -70
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +50 -36
- transformers/models/conditional_detr/modeling_conditional_detr.py +851 -1001
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -5
- transformers/models/convbert/configuration_convbert.py +11 -8
- transformers/models/convbert/modeling_convbert.py +85 -87
- transformers/models/convbert/tokenization_convbert.py +0 -1
- transformers/models/convnext/configuration_convnext.py +2 -5
- transformers/models/convnext/image_processing_convnext.py +18 -21
- transformers/models/convnext/image_processing_convnext_fast.py +7 -8
- transformers/models/convnext/modeling_convnext.py +12 -14
- transformers/models/convnextv2/configuration_convnextv2.py +2 -5
- transformers/models/convnextv2/modeling_convnextv2.py +12 -14
- transformers/models/cpm/tokenization_cpm.py +6 -7
- transformers/models/cpm/tokenization_cpm_fast.py +3 -5
- transformers/models/cpmant/configuration_cpmant.py +4 -1
- transformers/models/cpmant/modeling_cpmant.py +38 -40
- transformers/models/cpmant/tokenization_cpmant.py +1 -3
- transformers/models/csm/configuration_csm.py +58 -66
- transformers/models/csm/generation_csm.py +13 -14
- transformers/models/csm/modeling_csm.py +81 -84
- transformers/models/csm/modular_csm.py +56 -58
- transformers/models/csm/processing_csm.py +25 -68
- transformers/models/ctrl/configuration_ctrl.py +16 -1
- transformers/models/ctrl/modeling_ctrl.py +51 -66
- transformers/models/ctrl/tokenization_ctrl.py +0 -1
- transformers/models/cvt/configuration_cvt.py +0 -1
- transformers/models/cvt/modeling_cvt.py +13 -15
- transformers/models/cwm/__init__.py +0 -1
- transformers/models/cwm/configuration_cwm.py +8 -12
- transformers/models/cwm/modeling_cwm.py +36 -38
- transformers/models/cwm/modular_cwm.py +10 -12
- transformers/models/d_fine/configuration_d_fine.py +10 -57
- transformers/models/d_fine/modeling_d_fine.py +786 -927
- transformers/models/d_fine/modular_d_fine.py +339 -417
- transformers/models/dab_detr/configuration_dab_detr.py +22 -49
- transformers/models/dab_detr/modeling_dab_detr.py +79 -77
- transformers/models/dac/configuration_dac.py +0 -1
- transformers/models/dac/feature_extraction_dac.py +6 -9
- transformers/models/dac/modeling_dac.py +22 -24
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -2
- transformers/models/data2vec/configuration_data2vec_text.py +11 -3
- transformers/models/data2vec/configuration_data2vec_vision.py +0 -1
- transformers/models/data2vec/modeling_data2vec_audio.py +55 -59
- transformers/models/data2vec/modeling_data2vec_text.py +97 -99
- transformers/models/data2vec/modeling_data2vec_vision.py +45 -44
- transformers/models/data2vec/modular_data2vec_audio.py +6 -1
- transformers/models/data2vec/modular_data2vec_text.py +51 -54
- transformers/models/dbrx/configuration_dbrx.py +29 -22
- transformers/models/dbrx/modeling_dbrx.py +45 -48
- transformers/models/dbrx/modular_dbrx.py +37 -39
- transformers/models/deberta/configuration_deberta.py +6 -1
- transformers/models/deberta/modeling_deberta.py +57 -60
- transformers/models/deberta/tokenization_deberta.py +2 -5
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -1
- transformers/models/deberta_v2/modeling_deberta_v2.py +63 -65
- transformers/models/deberta_v2/tokenization_deberta_v2.py +1 -4
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -2
- transformers/models/decision_transformer/modeling_decision_transformer.py +51 -53
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +41 -47
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +39 -41
- transformers/models/deepseek_v2/modular_deepseek_v2.py +48 -52
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +42 -48
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +38 -40
- transformers/models/deepseek_v3/modular_deepseek_v3.py +10 -10
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +6 -3
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +27 -28
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +12 -11
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +48 -43
- transformers/models/deepseek_vl/modular_deepseek_vl.py +15 -43
- transformers/models/deepseek_vl/processing_deepseek_vl.py +10 -41
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +7 -5
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +37 -37
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +22 -22
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +100 -56
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +141 -109
- transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +12 -44
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -46
- transformers/models/deformable_detr/image_processing_deformable_detr.py +59 -61
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +42 -28
- transformers/models/deformable_detr/modeling_deformable_detr.py +454 -652
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -5
- transformers/models/deit/configuration_deit.py +0 -1
- transformers/models/deit/image_processing_deit.py +18 -21
- transformers/models/deit/image_processing_deit_fast.py +0 -1
- transformers/models/deit/modeling_deit.py +27 -25
- transformers/models/depth_anything/configuration_depth_anything.py +12 -43
- transformers/models/depth_anything/modeling_depth_anything.py +10 -11
- transformers/models/depth_pro/configuration_depth_pro.py +0 -1
- transformers/models/depth_pro/image_processing_depth_pro.py +22 -23
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +8 -9
- transformers/models/depth_pro/modeling_depth_pro.py +29 -27
- transformers/models/detr/configuration_detr.py +18 -50
- transformers/models/detr/image_processing_detr.py +64 -66
- transformers/models/detr/image_processing_detr_fast.py +33 -34
- transformers/models/detr/modeling_detr.py +748 -789
- transformers/models/dia/configuration_dia.py +9 -15
- transformers/models/dia/feature_extraction_dia.py +6 -9
- transformers/models/dia/generation_dia.py +48 -53
- transformers/models/dia/modeling_dia.py +68 -71
- transformers/models/dia/modular_dia.py +56 -58
- transformers/models/dia/processing_dia.py +39 -29
- transformers/models/dia/tokenization_dia.py +3 -6
- transformers/models/diffllama/configuration_diffllama.py +25 -30
- transformers/models/diffllama/modeling_diffllama.py +45 -53
- transformers/models/diffllama/modular_diffllama.py +18 -25
- transformers/models/dinat/configuration_dinat.py +2 -5
- transformers/models/dinat/modeling_dinat.py +47 -48
- transformers/models/dinov2/configuration_dinov2.py +2 -5
- transformers/models/dinov2/modeling_dinov2.py +20 -21
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +3 -5
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +21 -21
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +11 -14
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +6 -11
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +5 -9
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +7 -12
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +7 -8
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +19 -22
- transformers/models/dinov3_vit/modular_dinov3_vit.py +16 -19
- transformers/models/distilbert/configuration_distilbert.py +8 -2
- transformers/models/distilbert/modeling_distilbert.py +47 -49
- transformers/models/distilbert/tokenization_distilbert.py +0 -1
- transformers/models/doge/__init__.py +0 -1
- transformers/models/doge/configuration_doge.py +42 -35
- transformers/models/doge/modeling_doge.py +46 -49
- transformers/models/doge/modular_doge.py +77 -68
- transformers/models/donut/configuration_donut_swin.py +0 -1
- transformers/models/donut/image_processing_donut.py +26 -29
- transformers/models/donut/image_processing_donut_fast.py +9 -14
- transformers/models/donut/modeling_donut_swin.py +44 -46
- transformers/models/donut/processing_donut.py +5 -26
- transformers/models/dots1/configuration_dots1.py +43 -36
- transformers/models/dots1/modeling_dots1.py +35 -38
- transformers/models/dots1/modular_dots1.py +0 -1
- transformers/models/dpr/configuration_dpr.py +19 -2
- transformers/models/dpr/modeling_dpr.py +37 -39
- transformers/models/dpr/tokenization_dpr.py +7 -9
- transformers/models/dpr/tokenization_dpr_fast.py +7 -9
- transformers/models/dpt/configuration_dpt.py +23 -66
- transformers/models/dpt/image_processing_dpt.py +65 -66
- transformers/models/dpt/image_processing_dpt_fast.py +18 -19
- transformers/models/dpt/modeling_dpt.py +38 -36
- transformers/models/dpt/modular_dpt.py +14 -15
- transformers/models/edgetam/configuration_edgetam.py +1 -2
- transformers/models/edgetam/modeling_edgetam.py +87 -89
- transformers/models/edgetam/modular_edgetam.py +7 -13
- transformers/models/edgetam_video/__init__.py +0 -1
- transformers/models/edgetam_video/configuration_edgetam_video.py +0 -1
- transformers/models/edgetam_video/modeling_edgetam_video.py +126 -128
- transformers/models/edgetam_video/modular_edgetam_video.py +25 -27
- transformers/models/efficientloftr/configuration_efficientloftr.py +4 -5
- transformers/models/efficientloftr/image_processing_efficientloftr.py +14 -16
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +8 -7
- transformers/models/efficientloftr/modeling_efficientloftr.py +46 -38
- transformers/models/efficientloftr/modular_efficientloftr.py +1 -3
- transformers/models/efficientnet/configuration_efficientnet.py +0 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +23 -26
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +16 -17
- transformers/models/efficientnet/modeling_efficientnet.py +12 -14
- transformers/models/electra/configuration_electra.py +13 -3
- transformers/models/electra/modeling_electra.py +107 -109
- transformers/models/emu3/configuration_emu3.py +17 -17
- transformers/models/emu3/image_processing_emu3.py +44 -39
- transformers/models/emu3/modeling_emu3.py +143 -109
- transformers/models/emu3/modular_emu3.py +109 -73
- transformers/models/emu3/processing_emu3.py +18 -43
- transformers/models/encodec/configuration_encodec.py +2 -4
- transformers/models/encodec/feature_extraction_encodec.py +10 -13
- transformers/models/encodec/modeling_encodec.py +25 -29
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -2
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +37 -43
- transformers/models/eomt/configuration_eomt.py +12 -14
- transformers/models/eomt/image_processing_eomt.py +53 -55
- transformers/models/eomt/image_processing_eomt_fast.py +18 -19
- transformers/models/eomt/modeling_eomt.py +19 -21
- transformers/models/eomt/modular_eomt.py +28 -30
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -3
- transformers/models/ernie/modeling_ernie.py +127 -162
- transformers/models/ernie/modular_ernie.py +91 -103
- transformers/models/ernie4_5/configuration_ernie4_5.py +23 -27
- transformers/models/ernie4_5/modeling_ernie4_5.py +35 -37
- transformers/models/ernie4_5/modular_ernie4_5.py +1 -3
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +34 -39
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +40 -42
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +7 -9
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -7
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +34 -35
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +6 -7
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +305 -267
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +163 -142
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +3 -5
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +17 -18
- transformers/models/esm/configuration_esm.py +11 -15
- transformers/models/esm/modeling_esm.py +35 -37
- transformers/models/esm/modeling_esmfold.py +43 -50
- transformers/models/esm/openfold_utils/chunk_utils.py +6 -6
- transformers/models/esm/openfold_utils/loss.py +1 -2
- transformers/models/esm/openfold_utils/protein.py +15 -16
- transformers/models/esm/openfold_utils/tensor_utils.py +6 -6
- transformers/models/esm/tokenization_esm.py +2 -4
- transformers/models/evolla/configuration_evolla.py +50 -40
- transformers/models/evolla/modeling_evolla.py +69 -68
- transformers/models/evolla/modular_evolla.py +50 -48
- transformers/models/evolla/processing_evolla.py +23 -35
- transformers/models/exaone4/configuration_exaone4.py +27 -27
- transformers/models/exaone4/modeling_exaone4.py +36 -39
- transformers/models/exaone4/modular_exaone4.py +51 -50
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +31 -26
- transformers/models/falcon/modeling_falcon.py +76 -84
- transformers/models/falcon_h1/configuration_falcon_h1.py +57 -51
- transformers/models/falcon_h1/modeling_falcon_h1.py +74 -109
- transformers/models/falcon_h1/modular_falcon_h1.py +68 -100
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +64 -73
- transformers/models/falcon_mamba/modular_falcon_mamba.py +14 -13
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +70 -97
- transformers/models/fast_vlm/modular_fast_vlm.py +148 -38
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +2 -6
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +45 -47
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -3
- transformers/models/flaubert/configuration_flaubert.py +10 -5
- transformers/models/flaubert/modeling_flaubert.py +125 -129
- transformers/models/flaubert/tokenization_flaubert.py +3 -5
- transformers/models/flava/configuration_flava.py +9 -9
- transformers/models/flava/image_processing_flava.py +66 -67
- transformers/models/flava/image_processing_flava_fast.py +46 -47
- transformers/models/flava/modeling_flava.py +144 -135
- transformers/models/flava/processing_flava.py +2 -12
- transformers/models/flex_olmo/__init__.py +0 -1
- transformers/models/flex_olmo/configuration_flex_olmo.py +34 -39
- transformers/models/flex_olmo/modeling_flex_olmo.py +41 -43
- transformers/models/flex_olmo/modular_flex_olmo.py +46 -51
- transformers/models/florence2/configuration_florence2.py +4 -1
- transformers/models/florence2/modeling_florence2.py +96 -72
- transformers/models/florence2/modular_florence2.py +100 -107
- transformers/models/florence2/processing_florence2.py +18 -47
- transformers/models/fnet/configuration_fnet.py +6 -2
- transformers/models/fnet/modeling_fnet.py +69 -80
- transformers/models/fnet/tokenization_fnet.py +0 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -5
- transformers/models/focalnet/modeling_focalnet.py +49 -48
- transformers/models/fsmt/configuration_fsmt.py +12 -17
- transformers/models/fsmt/modeling_fsmt.py +47 -48
- transformers/models/fsmt/tokenization_fsmt.py +3 -5
- transformers/models/funnel/configuration_funnel.py +8 -1
- transformers/models/funnel/modeling_funnel.py +91 -93
- transformers/models/funnel/tokenization_funnel.py +2 -5
- transformers/models/fuyu/configuration_fuyu.py +28 -34
- transformers/models/fuyu/image_processing_fuyu.py +29 -31
- transformers/models/fuyu/image_processing_fuyu_fast.py +17 -17
- transformers/models/fuyu/modeling_fuyu.py +50 -52
- transformers/models/fuyu/processing_fuyu.py +9 -36
- transformers/models/gemma/configuration_gemma.py +25 -30
- transformers/models/gemma/modeling_gemma.py +36 -38
- transformers/models/gemma/modular_gemma.py +33 -36
- transformers/models/gemma/tokenization_gemma.py +3 -6
- transformers/models/gemma2/configuration_gemma2.py +30 -35
- transformers/models/gemma2/modeling_gemma2.py +38 -41
- transformers/models/gemma2/modular_gemma2.py +63 -67
- transformers/models/gemma3/configuration_gemma3.py +53 -48
- transformers/models/gemma3/image_processing_gemma3.py +29 -31
- transformers/models/gemma3/image_processing_gemma3_fast.py +11 -12
- transformers/models/gemma3/modeling_gemma3.py +123 -122
- transformers/models/gemma3/modular_gemma3.py +128 -125
- transformers/models/gemma3/processing_gemma3.py +5 -5
- transformers/models/gemma3n/configuration_gemma3n.py +42 -30
- transformers/models/gemma3n/feature_extraction_gemma3n.py +9 -11
- transformers/models/gemma3n/modeling_gemma3n.py +166 -147
- transformers/models/gemma3n/modular_gemma3n.py +176 -148
- transformers/models/gemma3n/processing_gemma3n.py +12 -26
- transformers/models/git/configuration_git.py +5 -8
- transformers/models/git/modeling_git.py +115 -127
- transformers/models/git/processing_git.py +2 -14
- transformers/models/glm/configuration_glm.py +26 -30
- transformers/models/glm/modeling_glm.py +36 -39
- transformers/models/glm/modular_glm.py +4 -7
- transformers/models/glm4/configuration_glm4.py +26 -30
- transformers/models/glm4/modeling_glm4.py +39 -41
- transformers/models/glm4/modular_glm4.py +8 -10
- transformers/models/glm46v/configuration_glm46v.py +4 -1
- transformers/models/glm46v/image_processing_glm46v.py +40 -38
- transformers/models/glm46v/image_processing_glm46v_fast.py +9 -9
- transformers/models/glm46v/modeling_glm46v.py +138 -93
- transformers/models/glm46v/modular_glm46v.py +5 -3
- transformers/models/glm46v/processing_glm46v.py +7 -41
- transformers/models/glm46v/video_processing_glm46v.py +9 -11
- transformers/models/glm4_moe/configuration_glm4_moe.py +42 -35
- transformers/models/glm4_moe/modeling_glm4_moe.py +36 -39
- transformers/models/glm4_moe/modular_glm4_moe.py +43 -36
- transformers/models/glm4_moe_lite/__init__.py +28 -0
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +233 -0
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +740 -0
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +302 -0
- transformers/models/glm4v/configuration_glm4v.py +25 -24
- transformers/models/glm4v/image_processing_glm4v.py +39 -38
- transformers/models/glm4v/image_processing_glm4v_fast.py +8 -9
- transformers/models/glm4v/modeling_glm4v.py +249 -210
- transformers/models/glm4v/modular_glm4v.py +211 -230
- transformers/models/glm4v/processing_glm4v.py +7 -41
- transformers/models/glm4v/video_processing_glm4v.py +9 -11
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +136 -127
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +348 -356
- transformers/models/glm4v_moe/modular_glm4v_moe.py +76 -174
- transformers/models/glm_image/__init__.py +31 -0
- transformers/models/glm_image/configuration_glm_image.py +358 -0
- transformers/models/glm_image/image_processing_glm_image.py +503 -0
- transformers/models/glm_image/image_processing_glm_image_fast.py +294 -0
- transformers/models/glm_image/modeling_glm_image.py +1691 -0
- transformers/models/glm_image/modular_glm_image.py +1640 -0
- transformers/models/glm_image/processing_glm_image.py +265 -0
- transformers/models/glm_ocr/__init__.py +28 -0
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/__init__.py +0 -1
- transformers/models/glmasr/configuration_glmasr.py +0 -1
- transformers/models/glmasr/modeling_glmasr.py +51 -46
- transformers/models/glmasr/modular_glmasr.py +39 -29
- transformers/models/glmasr/processing_glmasr.py +7 -8
- transformers/models/glpn/configuration_glpn.py +0 -1
- transformers/models/glpn/image_processing_glpn.py +11 -12
- transformers/models/glpn/image_processing_glpn_fast.py +11 -12
- transformers/models/glpn/modeling_glpn.py +14 -14
- transformers/models/got_ocr2/configuration_got_ocr2.py +10 -13
- transformers/models/got_ocr2/image_processing_got_ocr2.py +22 -24
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +9 -10
- transformers/models/got_ocr2/modeling_got_ocr2.py +69 -77
- transformers/models/got_ocr2/modular_got_ocr2.py +60 -52
- transformers/models/got_ocr2/processing_got_ocr2.py +42 -63
- transformers/models/gpt2/configuration_gpt2.py +13 -2
- transformers/models/gpt2/modeling_gpt2.py +111 -113
- transformers/models/gpt2/tokenization_gpt2.py +6 -9
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -2
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +78 -84
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -2
- transformers/models/gpt_neo/modeling_gpt_neo.py +66 -71
- transformers/models/gpt_neox/configuration_gpt_neox.py +27 -25
- transformers/models/gpt_neox/modeling_gpt_neox.py +74 -76
- transformers/models/gpt_neox/modular_gpt_neox.py +68 -70
- transformers/models/gpt_neox/tokenization_gpt_neox.py +2 -5
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +24 -19
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +43 -46
- transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +1 -3
- transformers/models/gpt_oss/configuration_gpt_oss.py +31 -30
- transformers/models/gpt_oss/modeling_gpt_oss.py +80 -114
- transformers/models/gpt_oss/modular_gpt_oss.py +62 -97
- transformers/models/gpt_sw3/tokenization_gpt_sw3.py +4 -4
- transformers/models/gptj/configuration_gptj.py +4 -5
- transformers/models/gptj/modeling_gptj.py +85 -88
- transformers/models/granite/configuration_granite.py +28 -33
- transformers/models/granite/modeling_granite.py +43 -45
- transformers/models/granite/modular_granite.py +29 -31
- transformers/models/granite_speech/configuration_granite_speech.py +0 -1
- transformers/models/granite_speech/feature_extraction_granite_speech.py +1 -3
- transformers/models/granite_speech/modeling_granite_speech.py +84 -60
- transformers/models/granite_speech/processing_granite_speech.py +11 -4
- transformers/models/granitemoe/configuration_granitemoe.py +31 -36
- transformers/models/granitemoe/modeling_granitemoe.py +39 -41
- transformers/models/granitemoe/modular_granitemoe.py +21 -23
- transformers/models/granitemoehybrid/__init__.py +0 -1
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +55 -48
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +82 -118
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +57 -65
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +33 -37
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +52 -56
- transformers/models/granitemoeshared/modular_granitemoeshared.py +19 -21
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -46
- transformers/models/grounding_dino/image_processing_grounding_dino.py +60 -62
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +28 -29
- transformers/models/grounding_dino/modeling_grounding_dino.py +161 -181
- transformers/models/grounding_dino/modular_grounding_dino.py +2 -3
- transformers/models/grounding_dino/processing_grounding_dino.py +10 -38
- transformers/models/groupvit/configuration_groupvit.py +4 -2
- transformers/models/groupvit/modeling_groupvit.py +98 -92
- transformers/models/helium/configuration_helium.py +25 -29
- transformers/models/helium/modeling_helium.py +37 -40
- transformers/models/helium/modular_helium.py +3 -7
- transformers/models/herbert/tokenization_herbert.py +4 -6
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -5
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +12 -14
- transformers/models/hgnet_v2/modular_hgnet_v2.py +13 -17
- transformers/models/hiera/configuration_hiera.py +2 -5
- transformers/models/hiera/modeling_hiera.py +71 -70
- transformers/models/hubert/configuration_hubert.py +4 -2
- transformers/models/hubert/modeling_hubert.py +42 -41
- transformers/models/hubert/modular_hubert.py +8 -11
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +26 -31
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +58 -37
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +31 -11
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +31 -36
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +54 -44
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +27 -15
- transformers/models/ibert/configuration_ibert.py +4 -2
- transformers/models/ibert/modeling_ibert.py +60 -62
- transformers/models/ibert/quant_modules.py +0 -1
- transformers/models/idefics/configuration_idefics.py +5 -8
- transformers/models/idefics/image_processing_idefics.py +13 -15
- transformers/models/idefics/modeling_idefics.py +63 -65
- transformers/models/idefics/perceiver.py +1 -3
- transformers/models/idefics/processing_idefics.py +32 -48
- transformers/models/idefics/vision.py +27 -28
- transformers/models/idefics2/configuration_idefics2.py +1 -3
- transformers/models/idefics2/image_processing_idefics2.py +31 -32
- transformers/models/idefics2/image_processing_idefics2_fast.py +8 -8
- transformers/models/idefics2/modeling_idefics2.py +126 -106
- transformers/models/idefics2/processing_idefics2.py +10 -68
- transformers/models/idefics3/configuration_idefics3.py +1 -4
- transformers/models/idefics3/image_processing_idefics3.py +42 -43
- transformers/models/idefics3/image_processing_idefics3_fast.py +40 -15
- transformers/models/idefics3/modeling_idefics3.py +113 -92
- transformers/models/idefics3/processing_idefics3.py +15 -69
- transformers/models/ijepa/configuration_ijepa.py +0 -1
- transformers/models/ijepa/modeling_ijepa.py +13 -14
- transformers/models/ijepa/modular_ijepa.py +5 -7
- transformers/models/imagegpt/configuration_imagegpt.py +9 -2
- transformers/models/imagegpt/image_processing_imagegpt.py +17 -18
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +10 -11
- transformers/models/imagegpt/modeling_imagegpt.py +65 -62
- transformers/models/informer/configuration_informer.py +6 -9
- transformers/models/informer/modeling_informer.py +87 -89
- transformers/models/informer/modular_informer.py +13 -16
- transformers/models/instructblip/configuration_instructblip.py +2 -2
- transformers/models/instructblip/modeling_instructblip.py +104 -79
- transformers/models/instructblip/processing_instructblip.py +10 -36
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -2
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +108 -105
- transformers/models/instructblipvideo/modular_instructblipvideo.py +73 -64
- transformers/models/instructblipvideo/processing_instructblipvideo.py +14 -33
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +6 -7
- transformers/models/internvl/configuration_internvl.py +5 -1
- transformers/models/internvl/modeling_internvl.py +76 -98
- transformers/models/internvl/modular_internvl.py +45 -59
- transformers/models/internvl/processing_internvl.py +12 -45
- transformers/models/internvl/video_processing_internvl.py +10 -11
- transformers/models/jais2/configuration_jais2.py +25 -29
- transformers/models/jais2/modeling_jais2.py +36 -38
- transformers/models/jais2/modular_jais2.py +20 -22
- transformers/models/jamba/configuration_jamba.py +5 -8
- transformers/models/jamba/modeling_jamba.py +47 -50
- transformers/models/jamba/modular_jamba.py +40 -41
- transformers/models/janus/configuration_janus.py +0 -1
- transformers/models/janus/image_processing_janus.py +37 -39
- transformers/models/janus/image_processing_janus_fast.py +20 -21
- transformers/models/janus/modeling_janus.py +103 -188
- transformers/models/janus/modular_janus.py +122 -83
- transformers/models/janus/processing_janus.py +17 -43
- transformers/models/jetmoe/configuration_jetmoe.py +26 -27
- transformers/models/jetmoe/modeling_jetmoe.py +42 -45
- transformers/models/jetmoe/modular_jetmoe.py +33 -36
- transformers/models/kosmos2/configuration_kosmos2.py +10 -9
- transformers/models/kosmos2/modeling_kosmos2.py +199 -178
- transformers/models/kosmos2/processing_kosmos2.py +40 -55
- transformers/models/kosmos2_5/__init__.py +0 -1
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -9
- transformers/models/kosmos2_5/image_processing_kosmos2_5.py +10 -12
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -11
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +162 -172
- transformers/models/kosmos2_5/processing_kosmos2_5.py +8 -29
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +31 -28
- transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +12 -14
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +103 -106
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +20 -22
- transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py +2 -8
- transformers/models/lasr/configuration_lasr.py +3 -7
- transformers/models/lasr/feature_extraction_lasr.py +10 -12
- transformers/models/lasr/modeling_lasr.py +21 -24
- transformers/models/lasr/modular_lasr.py +11 -13
- transformers/models/lasr/processing_lasr.py +12 -6
- transformers/models/lasr/tokenization_lasr.py +2 -4
- transformers/models/layoutlm/configuration_layoutlm.py +14 -2
- transformers/models/layoutlm/modeling_layoutlm.py +70 -72
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -17
- transformers/models/layoutlmv2/image_processing_layoutlmv2.py +18 -21
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +7 -8
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +48 -50
- transformers/models/layoutlmv2/processing_layoutlmv2.py +14 -44
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +63 -74
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -19
- transformers/models/layoutlmv3/image_processing_layoutlmv3.py +24 -26
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +9 -10
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +49 -51
- transformers/models/layoutlmv3/processing_layoutlmv3.py +14 -46
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +64 -75
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -17
- transformers/models/layoutxlm/modular_layoutxlm.py +0 -1
- transformers/models/layoutxlm/processing_layoutxlm.py +14 -44
- transformers/models/layoutxlm/tokenization_layoutxlm.py +65 -76
- transformers/models/led/configuration_led.py +8 -12
- transformers/models/led/modeling_led.py +113 -267
- transformers/models/levit/configuration_levit.py +0 -1
- transformers/models/levit/image_processing_levit.py +19 -21
- transformers/models/levit/image_processing_levit_fast.py +4 -5
- transformers/models/levit/modeling_levit.py +17 -19
- transformers/models/lfm2/configuration_lfm2.py +27 -30
- transformers/models/lfm2/modeling_lfm2.py +46 -48
- transformers/models/lfm2/modular_lfm2.py +32 -32
- transformers/models/lfm2_moe/__init__.py +0 -1
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +6 -9
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +48 -49
- transformers/models/lfm2_moe/modular_lfm2_moe.py +8 -9
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -1
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +43 -20
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +73 -61
- transformers/models/lfm2_vl/modular_lfm2_vl.py +66 -54
- transformers/models/lfm2_vl/processing_lfm2_vl.py +14 -34
- transformers/models/lightglue/image_processing_lightglue.py +16 -15
- transformers/models/lightglue/image_processing_lightglue_fast.py +8 -7
- transformers/models/lightglue/modeling_lightglue.py +31 -33
- transformers/models/lightglue/modular_lightglue.py +31 -31
- transformers/models/lighton_ocr/__init__.py +28 -0
- transformers/models/lighton_ocr/configuration_lighton_ocr.py +128 -0
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +463 -0
- transformers/models/lighton_ocr/modular_lighton_ocr.py +404 -0
- transformers/models/lighton_ocr/processing_lighton_ocr.py +229 -0
- transformers/models/lilt/configuration_lilt.py +6 -2
- transformers/models/lilt/modeling_lilt.py +53 -55
- transformers/models/llama/configuration_llama.py +26 -31
- transformers/models/llama/modeling_llama.py +35 -38
- transformers/models/llama/tokenization_llama.py +2 -4
- transformers/models/llama4/configuration_llama4.py +87 -69
- transformers/models/llama4/image_processing_llama4_fast.py +11 -12
- transformers/models/llama4/modeling_llama4.py +116 -115
- transformers/models/llama4/processing_llama4.py +33 -57
- transformers/models/llava/configuration_llava.py +10 -1
- transformers/models/llava/image_processing_llava.py +25 -28
- transformers/models/llava/image_processing_llava_fast.py +9 -10
- transformers/models/llava/modeling_llava.py +73 -102
- transformers/models/llava/processing_llava.py +18 -51
- transformers/models/llava_next/configuration_llava_next.py +2 -2
- transformers/models/llava_next/image_processing_llava_next.py +43 -45
- transformers/models/llava_next/image_processing_llava_next_fast.py +11 -12
- transformers/models/llava_next/modeling_llava_next.py +103 -104
- transformers/models/llava_next/processing_llava_next.py +18 -47
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -7
- transformers/models/llava_next_video/modeling_llava_next_video.py +168 -155
- transformers/models/llava_next_video/modular_llava_next_video.py +154 -147
- transformers/models/llava_next_video/processing_llava_next_video.py +21 -63
- transformers/models/llava_next_video/video_processing_llava_next_video.py +0 -1
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -7
- transformers/models/llava_onevision/image_processing_llava_onevision.py +40 -42
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +14 -14
- transformers/models/llava_onevision/modeling_llava_onevision.py +170 -166
- transformers/models/llava_onevision/modular_llava_onevision.py +156 -152
- transformers/models/llava_onevision/processing_llava_onevision.py +21 -53
- transformers/models/llava_onevision/video_processing_llava_onevision.py +0 -1
- transformers/models/longcat_flash/__init__.py +0 -1
- transformers/models/longcat_flash/configuration_longcat_flash.py +39 -45
- transformers/models/longcat_flash/modeling_longcat_flash.py +37 -38
- transformers/models/longcat_flash/modular_longcat_flash.py +23 -24
- transformers/models/longformer/configuration_longformer.py +5 -5
- transformers/models/longformer/modeling_longformer.py +99 -101
- transformers/models/longt5/configuration_longt5.py +9 -7
- transformers/models/longt5/modeling_longt5.py +45 -45
- transformers/models/luke/configuration_luke.py +8 -2
- transformers/models/luke/modeling_luke.py +179 -181
- transformers/models/luke/tokenization_luke.py +99 -105
- transformers/{pipelines/deprecated → models/lw_detr}/__init__.py +14 -3
- transformers/models/lw_detr/configuration_lw_detr.py +362 -0
- transformers/models/lw_detr/modeling_lw_detr.py +1697 -0
- transformers/models/lw_detr/modular_lw_detr.py +1609 -0
- transformers/models/lxmert/configuration_lxmert.py +16 -1
- transformers/models/lxmert/modeling_lxmert.py +63 -74
- transformers/models/m2m_100/configuration_m2m_100.py +7 -9
- transformers/models/m2m_100/modeling_m2m_100.py +72 -74
- transformers/models/m2m_100/tokenization_m2m_100.py +8 -8
- transformers/models/mamba/configuration_mamba.py +5 -3
- transformers/models/mamba/modeling_mamba.py +61 -70
- transformers/models/mamba2/configuration_mamba2.py +5 -8
- transformers/models/mamba2/modeling_mamba2.py +66 -79
- transformers/models/marian/configuration_marian.py +10 -5
- transformers/models/marian/modeling_marian.py +88 -90
- transformers/models/marian/tokenization_marian.py +6 -6
- transformers/models/markuplm/configuration_markuplm.py +4 -7
- transformers/models/markuplm/feature_extraction_markuplm.py +1 -2
- transformers/models/markuplm/modeling_markuplm.py +63 -65
- transformers/models/markuplm/processing_markuplm.py +31 -38
- transformers/models/markuplm/tokenization_markuplm.py +67 -77
- transformers/models/mask2former/configuration_mask2former.py +14 -52
- transformers/models/mask2former/image_processing_mask2former.py +84 -85
- transformers/models/mask2former/image_processing_mask2former_fast.py +36 -36
- transformers/models/mask2former/modeling_mask2former.py +108 -104
- transformers/models/mask2former/modular_mask2former.py +6 -8
- transformers/models/maskformer/configuration_maskformer.py +17 -51
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -5
- transformers/models/maskformer/image_processing_maskformer.py +84 -85
- transformers/models/maskformer/image_processing_maskformer_fast.py +35 -36
- transformers/models/maskformer/modeling_maskformer.py +71 -67
- transformers/models/maskformer/modeling_maskformer_swin.py +20 -23
- transformers/models/mbart/configuration_mbart.py +9 -5
- transformers/models/mbart/modeling_mbart.py +120 -119
- transformers/models/mbart/tokenization_mbart.py +2 -4
- transformers/models/mbart50/tokenization_mbart50.py +3 -5
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -3
- transformers/models/megatron_bert/modeling_megatron_bert.py +139 -165
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +94 -87
- transformers/models/metaclip_2/modular_metaclip_2.py +59 -45
- transformers/models/mgp_str/configuration_mgp_str.py +0 -1
- transformers/models/mgp_str/modeling_mgp_str.py +18 -18
- transformers/models/mgp_str/processing_mgp_str.py +3 -20
- transformers/models/mgp_str/tokenization_mgp_str.py +1 -3
- transformers/models/mimi/configuration_mimi.py +42 -40
- transformers/models/mimi/modeling_mimi.py +116 -115
- transformers/models/minimax/__init__.py +0 -1
- transformers/models/minimax/configuration_minimax.py +40 -47
- transformers/models/minimax/modeling_minimax.py +46 -49
- transformers/models/minimax/modular_minimax.py +59 -65
- transformers/models/minimax_m2/__init__.py +28 -0
- transformers/models/minimax_m2/configuration_minimax_m2.py +188 -0
- transformers/models/minimax_m2/modeling_minimax_m2.py +704 -0
- transformers/models/minimax_m2/modular_minimax_m2.py +346 -0
- transformers/models/ministral/configuration_ministral.py +25 -29
- transformers/models/ministral/modeling_ministral.py +35 -37
- transformers/models/ministral/modular_ministral.py +32 -37
- transformers/models/ministral3/configuration_ministral3.py +23 -26
- transformers/models/ministral3/modeling_ministral3.py +35 -37
- transformers/models/ministral3/modular_ministral3.py +7 -8
- transformers/models/mistral/configuration_mistral.py +24 -29
- transformers/models/mistral/modeling_mistral.py +35 -37
- transformers/models/mistral/modular_mistral.py +14 -15
- transformers/models/mistral3/configuration_mistral3.py +4 -1
- transformers/models/mistral3/modeling_mistral3.py +79 -82
- transformers/models/mistral3/modular_mistral3.py +66 -67
- transformers/models/mixtral/configuration_mixtral.py +32 -38
- transformers/models/mixtral/modeling_mixtral.py +39 -42
- transformers/models/mixtral/modular_mixtral.py +26 -29
- transformers/models/mlcd/configuration_mlcd.py +0 -1
- transformers/models/mlcd/modeling_mlcd.py +17 -17
- transformers/models/mlcd/modular_mlcd.py +16 -16
- transformers/models/mllama/configuration_mllama.py +10 -15
- transformers/models/mllama/image_processing_mllama.py +23 -25
- transformers/models/mllama/image_processing_mllama_fast.py +11 -11
- transformers/models/mllama/modeling_mllama.py +100 -103
- transformers/models/mllama/processing_mllama.py +6 -55
- transformers/models/mluke/tokenization_mluke.py +97 -103
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -46
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +159 -179
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -46
- transformers/models/mobilebert/configuration_mobilebert.py +4 -2
- transformers/models/mobilebert/modeling_mobilebert.py +78 -88
- transformers/models/mobilebert/tokenization_mobilebert.py +0 -1
- transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +0 -1
- transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +20 -23
- transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +0 -1
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +13 -16
- transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +0 -1
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +48 -51
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +14 -15
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +21 -22
- transformers/models/mobilevit/configuration_mobilevit.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +41 -44
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +12 -13
- transformers/models/mobilevit/modeling_mobilevit.py +21 -21
- transformers/models/mobilevitv2/configuration_mobilevitv2.py +0 -1
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +21 -22
- transformers/models/modernbert/configuration_modernbert.py +76 -51
- transformers/models/modernbert/modeling_modernbert.py +188 -943
- transformers/models/modernbert/modular_modernbert.py +255 -978
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +50 -44
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +54 -64
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +92 -92
- transformers/models/moonshine/configuration_moonshine.py +34 -31
- transformers/models/moonshine/modeling_moonshine.py +70 -72
- transformers/models/moonshine/modular_moonshine.py +91 -86
- transformers/models/moshi/configuration_moshi.py +46 -23
- transformers/models/moshi/modeling_moshi.py +134 -142
- transformers/models/mpnet/configuration_mpnet.py +6 -2
- transformers/models/mpnet/modeling_mpnet.py +55 -57
- transformers/models/mpnet/tokenization_mpnet.py +1 -4
- transformers/models/mpt/configuration_mpt.py +17 -9
- transformers/models/mpt/modeling_mpt.py +58 -60
- transformers/models/mra/configuration_mra.py +8 -2
- transformers/models/mra/modeling_mra.py +54 -56
- transformers/models/mt5/configuration_mt5.py +9 -6
- transformers/models/mt5/modeling_mt5.py +80 -85
- transformers/models/musicgen/configuration_musicgen.py +12 -8
- transformers/models/musicgen/modeling_musicgen.py +114 -116
- transformers/models/musicgen/processing_musicgen.py +3 -21
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -8
- transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +8 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +113 -126
- transformers/models/musicgen_melody/processing_musicgen_melody.py +3 -22
- transformers/models/mvp/configuration_mvp.py +8 -5
- transformers/models/mvp/modeling_mvp.py +121 -123
- transformers/models/myt5/tokenization_myt5.py +8 -10
- transformers/models/nanochat/configuration_nanochat.py +5 -8
- transformers/models/nanochat/modeling_nanochat.py +36 -39
- transformers/models/nanochat/modular_nanochat.py +16 -18
- transformers/models/nemotron/configuration_nemotron.py +25 -30
- transformers/models/nemotron/modeling_nemotron.py +53 -66
- transformers/models/nllb/tokenization_nllb.py +14 -14
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -10
- transformers/models/nllb_moe/modeling_nllb_moe.py +70 -72
- transformers/models/nougat/image_processing_nougat.py +29 -32
- transformers/models/nougat/image_processing_nougat_fast.py +12 -13
- transformers/models/nougat/processing_nougat.py +37 -39
- transformers/models/nougat/tokenization_nougat.py +5 -7
- transformers/models/nystromformer/configuration_nystromformer.py +8 -2
- transformers/models/nystromformer/modeling_nystromformer.py +61 -63
- transformers/models/olmo/configuration_olmo.py +23 -28
- transformers/models/olmo/modeling_olmo.py +35 -38
- transformers/models/olmo/modular_olmo.py +8 -12
- transformers/models/olmo2/configuration_olmo2.py +27 -32
- transformers/models/olmo2/modeling_olmo2.py +36 -39
- transformers/models/olmo2/modular_olmo2.py +36 -38
- transformers/models/olmo3/__init__.py +0 -1
- transformers/models/olmo3/configuration_olmo3.py +30 -34
- transformers/models/olmo3/modeling_olmo3.py +35 -38
- transformers/models/olmo3/modular_olmo3.py +44 -47
- transformers/models/olmoe/configuration_olmoe.py +29 -33
- transformers/models/olmoe/modeling_olmoe.py +41 -43
- transformers/models/olmoe/modular_olmoe.py +15 -16
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -50
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +59 -57
- transformers/models/omdet_turbo/processing_omdet_turbo.py +19 -67
- transformers/models/oneformer/configuration_oneformer.py +11 -51
- transformers/models/oneformer/image_processing_oneformer.py +83 -84
- transformers/models/oneformer/image_processing_oneformer_fast.py +41 -42
- transformers/models/oneformer/modeling_oneformer.py +137 -133
- transformers/models/oneformer/processing_oneformer.py +28 -43
- transformers/models/openai/configuration_openai.py +16 -1
- transformers/models/openai/modeling_openai.py +50 -51
- transformers/models/openai/tokenization_openai.py +2 -5
- transformers/models/opt/configuration_opt.py +6 -7
- transformers/models/opt/modeling_opt.py +79 -80
- transformers/models/ovis2/__init__.py +0 -1
- transformers/models/ovis2/configuration_ovis2.py +4 -1
- transformers/models/ovis2/image_processing_ovis2.py +22 -24
- transformers/models/ovis2/image_processing_ovis2_fast.py +9 -10
- transformers/models/ovis2/modeling_ovis2.py +99 -142
- transformers/models/ovis2/modular_ovis2.py +82 -45
- transformers/models/ovis2/processing_ovis2.py +12 -40
- transformers/models/owlv2/configuration_owlv2.py +4 -2
- transformers/models/owlv2/image_processing_owlv2.py +20 -21
- transformers/models/owlv2/image_processing_owlv2_fast.py +12 -13
- transformers/models/owlv2/modeling_owlv2.py +122 -114
- transformers/models/owlv2/modular_owlv2.py +11 -12
- transformers/models/owlv2/processing_owlv2.py +20 -49
- transformers/models/owlvit/configuration_owlvit.py +4 -2
- transformers/models/owlvit/image_processing_owlvit.py +21 -22
- transformers/models/owlvit/image_processing_owlvit_fast.py +2 -3
- transformers/models/owlvit/modeling_owlvit.py +121 -113
- transformers/models/owlvit/processing_owlvit.py +20 -48
- transformers/models/paddleocr_vl/__init__.py +0 -1
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +28 -29
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +34 -35
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +12 -12
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +159 -158
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +148 -119
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +1 -3
- transformers/models/paligemma/configuration_paligemma.py +4 -1
- transformers/models/paligemma/modeling_paligemma.py +81 -79
- transformers/models/paligemma/processing_paligemma.py +13 -66
- transformers/models/parakeet/configuration_parakeet.py +3 -8
- transformers/models/parakeet/feature_extraction_parakeet.py +10 -12
- transformers/models/parakeet/modeling_parakeet.py +21 -25
- transformers/models/parakeet/modular_parakeet.py +19 -21
- transformers/models/parakeet/processing_parakeet.py +12 -5
- transformers/models/parakeet/tokenization_parakeet.py +2 -4
- transformers/models/patchtsmixer/configuration_patchtsmixer.py +5 -8
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +63 -65
- transformers/models/patchtst/configuration_patchtst.py +6 -9
- transformers/models/patchtst/modeling_patchtst.py +75 -77
- transformers/models/pe_audio/__init__.py +0 -1
- transformers/models/pe_audio/configuration_pe_audio.py +14 -16
- transformers/models/pe_audio/feature_extraction_pe_audio.py +6 -8
- transformers/models/pe_audio/modeling_pe_audio.py +30 -31
- transformers/models/pe_audio/modular_pe_audio.py +17 -18
- transformers/models/pe_audio/processing_pe_audio.py +0 -1
- transformers/models/pe_audio_video/__init__.py +0 -1
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +15 -17
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +64 -65
- transformers/models/pe_audio_video/modular_pe_audio_video.py +56 -57
- transformers/models/pe_audio_video/processing_pe_audio_video.py +0 -1
- transformers/models/pe_video/__init__.py +0 -1
- transformers/models/pe_video/configuration_pe_video.py +14 -16
- transformers/models/pe_video/modeling_pe_video.py +57 -46
- transformers/models/pe_video/modular_pe_video.py +47 -35
- transformers/models/pe_video/video_processing_pe_video.py +2 -4
- transformers/models/pegasus/configuration_pegasus.py +8 -6
- transformers/models/pegasus/modeling_pegasus.py +67 -69
- transformers/models/pegasus/tokenization_pegasus.py +1 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -4
- transformers/models/pegasus_x/modeling_pegasus_x.py +53 -55
- transformers/models/perceiver/configuration_perceiver.py +0 -1
- transformers/models/perceiver/image_processing_perceiver.py +22 -25
- transformers/models/perceiver/image_processing_perceiver_fast.py +7 -8
- transformers/models/perceiver/modeling_perceiver.py +152 -145
- transformers/models/perceiver/tokenization_perceiver.py +3 -6
- transformers/models/perception_lm/configuration_perception_lm.py +0 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +8 -9
- transformers/models/perception_lm/modeling_perception_lm.py +64 -67
- transformers/models/perception_lm/modular_perception_lm.py +58 -58
- transformers/models/perception_lm/processing_perception_lm.py +13 -47
- transformers/models/perception_lm/video_processing_perception_lm.py +0 -1
- transformers/models/persimmon/configuration_persimmon.py +23 -28
- transformers/models/persimmon/modeling_persimmon.py +44 -47
- transformers/models/phi/configuration_phi.py +27 -28
- transformers/models/phi/modeling_phi.py +39 -41
- transformers/models/phi/modular_phi.py +26 -26
- transformers/models/phi3/configuration_phi3.py +32 -37
- transformers/models/phi3/modeling_phi3.py +37 -40
- transformers/models/phi3/modular_phi3.py +16 -20
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +36 -39
- transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +7 -9
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +11 -11
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +100 -117
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +103 -90
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +7 -42
- transformers/models/phimoe/configuration_phimoe.py +31 -36
- transformers/models/phimoe/modeling_phimoe.py +50 -77
- transformers/models/phimoe/modular_phimoe.py +12 -8
- transformers/models/phobert/tokenization_phobert.py +4 -6
- transformers/models/pix2struct/configuration_pix2struct.py +12 -10
- transformers/models/pix2struct/image_processing_pix2struct.py +15 -19
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +12 -15
- transformers/models/pix2struct/modeling_pix2struct.py +56 -52
- transformers/models/pix2struct/processing_pix2struct.py +5 -26
- transformers/models/pixio/__init__.py +0 -1
- transformers/models/pixio/configuration_pixio.py +2 -5
- transformers/models/pixio/modeling_pixio.py +16 -17
- transformers/models/pixio/modular_pixio.py +7 -8
- transformers/models/pixtral/configuration_pixtral.py +11 -14
- transformers/models/pixtral/image_processing_pixtral.py +26 -28
- transformers/models/pixtral/image_processing_pixtral_fast.py +10 -11
- transformers/models/pixtral/modeling_pixtral.py +31 -37
- transformers/models/pixtral/processing_pixtral.py +18 -52
- transformers/models/plbart/configuration_plbart.py +8 -6
- transformers/models/plbart/modeling_plbart.py +109 -109
- transformers/models/plbart/modular_plbart.py +31 -33
- transformers/models/plbart/tokenization_plbart.py +4 -5
- transformers/models/poolformer/configuration_poolformer.py +0 -1
- transformers/models/poolformer/image_processing_poolformer.py +21 -24
- transformers/models/poolformer/image_processing_poolformer_fast.py +13 -14
- transformers/models/poolformer/modeling_poolformer.py +10 -12
- transformers/models/pop2piano/configuration_pop2piano.py +7 -7
- transformers/models/pop2piano/feature_extraction_pop2piano.py +6 -9
- transformers/models/pop2piano/modeling_pop2piano.py +24 -24
- transformers/models/pop2piano/processing_pop2piano.py +25 -33
- transformers/models/pop2piano/tokenization_pop2piano.py +15 -23
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +13 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +28 -28
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +20 -21
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +17 -16
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +21 -20
- transformers/models/prophetnet/configuration_prophetnet.py +37 -38
- transformers/models/prophetnet/modeling_prophetnet.py +121 -153
- transformers/models/prophetnet/tokenization_prophetnet.py +14 -16
- transformers/models/pvt/configuration_pvt.py +0 -1
- transformers/models/pvt/image_processing_pvt.py +24 -27
- transformers/models/pvt/image_processing_pvt_fast.py +1 -2
- transformers/models/pvt/modeling_pvt.py +19 -21
- transformers/models/pvt_v2/configuration_pvt_v2.py +4 -8
- transformers/models/pvt_v2/modeling_pvt_v2.py +27 -28
- transformers/models/qwen2/configuration_qwen2.py +32 -25
- transformers/models/qwen2/modeling_qwen2.py +35 -37
- transformers/models/qwen2/modular_qwen2.py +14 -15
- transformers/models/qwen2/tokenization_qwen2.py +2 -9
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +36 -27
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +241 -214
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +228 -193
- transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +41 -49
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +28 -34
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +188 -145
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +64 -91
- transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +7 -43
- transformers/models/qwen2_audio/configuration_qwen2_audio.py +0 -1
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +39 -41
- transformers/models/qwen2_audio/processing_qwen2_audio.py +13 -42
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +42 -35
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +40 -43
- transformers/models/qwen2_moe/modular_qwen2_moe.py +10 -13
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +28 -33
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +38 -40
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +12 -15
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +184 -141
- transformers/models/qwen2_vl/processing_qwen2_vl.py +7 -44
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +38 -18
- transformers/models/qwen3/configuration_qwen3.py +34 -27
- transformers/models/qwen3/modeling_qwen3.py +35 -38
- transformers/models/qwen3/modular_qwen3.py +7 -9
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +45 -35
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +40 -43
- transformers/models/qwen3_moe/modular_qwen3_moe.py +10 -13
- transformers/models/qwen3_next/configuration_qwen3_next.py +47 -38
- transformers/models/qwen3_next/modeling_qwen3_next.py +44 -47
- transformers/models/qwen3_next/modular_qwen3_next.py +37 -38
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +139 -106
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +266 -206
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +228 -181
- transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +40 -48
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +22 -24
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +185 -122
- transformers/models/qwen3_vl/modular_qwen3_vl.py +153 -139
- transformers/models/qwen3_vl/processing_qwen3_vl.py +6 -42
- transformers/models/qwen3_vl/video_processing_qwen3_vl.py +10 -12
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +27 -30
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +249 -178
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +55 -42
- transformers/models/rag/configuration_rag.py +6 -7
- transformers/models/rag/modeling_rag.py +119 -121
- transformers/models/rag/retrieval_rag.py +3 -5
- transformers/models/rag/tokenization_rag.py +0 -50
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +29 -30
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +35 -39
- transformers/models/reformer/configuration_reformer.py +7 -8
- transformers/models/reformer/modeling_reformer.py +67 -68
- transformers/models/reformer/tokenization_reformer.py +3 -6
- transformers/models/regnet/configuration_regnet.py +0 -1
- transformers/models/regnet/modeling_regnet.py +7 -9
- transformers/models/rembert/configuration_rembert.py +8 -2
- transformers/models/rembert/modeling_rembert.py +108 -132
- transformers/models/rembert/tokenization_rembert.py +1 -4
- transformers/models/resnet/configuration_resnet.py +2 -5
- transformers/models/resnet/modeling_resnet.py +14 -15
- transformers/models/roberta/configuration_roberta.py +11 -3
- transformers/models/roberta/modeling_roberta.py +97 -99
- transformers/models/roberta/modular_roberta.py +55 -58
- transformers/models/roberta/tokenization_roberta.py +2 -5
- transformers/models/roberta/tokenization_roberta_old.py +2 -4
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -3
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +97 -99
- transformers/models/roc_bert/configuration_roc_bert.py +8 -2
- transformers/models/roc_bert/modeling_roc_bert.py +125 -162
- transformers/models/roc_bert/tokenization_roc_bert.py +88 -94
- transformers/models/roformer/configuration_roformer.py +13 -3
- transformers/models/roformer/modeling_roformer.py +79 -95
- transformers/models/roformer/tokenization_roformer.py +3 -6
- transformers/models/roformer/tokenization_utils.py +0 -1
- transformers/models/rt_detr/configuration_rt_detr.py +8 -50
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -5
- transformers/models/rt_detr/image_processing_rt_detr.py +54 -55
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +39 -26
- transformers/models/rt_detr/modeling_rt_detr.py +643 -804
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +4 -7
- transformers/models/rt_detr/modular_rt_detr.py +1522 -20
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -58
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +384 -521
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +27 -70
- transformers/models/rwkv/configuration_rwkv.py +2 -4
- transformers/models/rwkv/modeling_rwkv.py +29 -54
- transformers/models/sam/configuration_sam.py +2 -1
- transformers/models/sam/image_processing_sam.py +59 -60
- transformers/models/sam/image_processing_sam_fast.py +25 -26
- transformers/models/sam/modeling_sam.py +46 -43
- transformers/models/sam/processing_sam.py +39 -27
- transformers/models/sam2/configuration_sam2.py +1 -2
- transformers/models/sam2/image_processing_sam2_fast.py +14 -15
- transformers/models/sam2/modeling_sam2.py +96 -94
- transformers/models/sam2/modular_sam2.py +85 -94
- transformers/models/sam2/processing_sam2.py +31 -47
- transformers/models/sam2_video/configuration_sam2_video.py +0 -1
- transformers/models/sam2_video/modeling_sam2_video.py +114 -116
- transformers/models/sam2_video/modular_sam2_video.py +72 -89
- transformers/models/sam2_video/processing_sam2_video.py +49 -66
- transformers/models/sam2_video/video_processing_sam2_video.py +1 -4
- transformers/models/sam3/configuration_sam3.py +0 -1
- transformers/models/sam3/image_processing_sam3_fast.py +17 -20
- transformers/models/sam3/modeling_sam3.py +94 -100
- transformers/models/sam3/modular_sam3.py +3 -8
- transformers/models/sam3/processing_sam3.py +37 -52
- transformers/models/sam3_tracker/__init__.py +0 -1
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -3
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +79 -80
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -2
- transformers/models/sam3_tracker/processing_sam3_tracker.py +31 -48
- transformers/models/sam3_tracker_video/__init__.py +0 -1
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +115 -114
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -24
- transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +50 -66
- transformers/models/sam3_video/configuration_sam3_video.py +0 -1
- transformers/models/sam3_video/modeling_sam3_video.py +56 -45
- transformers/models/sam3_video/processing_sam3_video.py +25 -45
- transformers/models/sam_hq/__init__.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +2 -1
- transformers/models/sam_hq/modeling_sam_hq.py +52 -50
- transformers/models/sam_hq/modular_sam_hq.py +23 -25
- transformers/models/sam_hq/{processing_samhq.py → processing_sam_hq.py} +41 -29
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -10
- transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py +8 -11
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +180 -182
- transformers/models/seamless_m4t/processing_seamless_m4t.py +18 -39
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +15 -20
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -10
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +193 -195
- transformers/models/seed_oss/configuration_seed_oss.py +30 -34
- transformers/models/seed_oss/modeling_seed_oss.py +34 -36
- transformers/models/seed_oss/modular_seed_oss.py +6 -7
- transformers/models/segformer/configuration_segformer.py +0 -10
- transformers/models/segformer/image_processing_segformer.py +39 -42
- transformers/models/segformer/image_processing_segformer_fast.py +11 -12
- transformers/models/segformer/modeling_segformer.py +28 -28
- transformers/models/segformer/modular_segformer.py +8 -9
- transformers/models/seggpt/configuration_seggpt.py +0 -1
- transformers/models/seggpt/image_processing_seggpt.py +38 -41
- transformers/models/seggpt/modeling_seggpt.py +48 -38
- transformers/models/sew/configuration_sew.py +4 -2
- transformers/models/sew/modeling_sew.py +42 -40
- transformers/models/sew/modular_sew.py +12 -13
- transformers/models/sew_d/configuration_sew_d.py +4 -2
- transformers/models/sew_d/modeling_sew_d.py +32 -31
- transformers/models/shieldgemma2/configuration_shieldgemma2.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +19 -21
- transformers/models/shieldgemma2/processing_shieldgemma2.py +3 -5
- transformers/models/siglip/configuration_siglip.py +4 -2
- transformers/models/siglip/image_processing_siglip.py +17 -20
- transformers/models/siglip/image_processing_siglip_fast.py +0 -1
- transformers/models/siglip/modeling_siglip.py +65 -110
- transformers/models/siglip/processing_siglip.py +2 -14
- transformers/models/siglip/tokenization_siglip.py +6 -7
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2.py +15 -16
- transformers/models/siglip2/image_processing_siglip2_fast.py +6 -7
- transformers/models/siglip2/modeling_siglip2.py +89 -130
- transformers/models/siglip2/modular_siglip2.py +95 -48
- transformers/models/siglip2/processing_siglip2.py +2 -14
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +29 -32
- transformers/models/smollm3/modeling_smollm3.py +35 -38
- transformers/models/smollm3/modular_smollm3.py +36 -38
- transformers/models/smolvlm/configuration_smolvlm.py +2 -4
- transformers/models/smolvlm/image_processing_smolvlm.py +42 -43
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +41 -15
- transformers/models/smolvlm/modeling_smolvlm.py +124 -96
- transformers/models/smolvlm/modular_smolvlm.py +50 -39
- transformers/models/smolvlm/processing_smolvlm.py +15 -76
- transformers/models/smolvlm/video_processing_smolvlm.py +16 -17
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +0 -1
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +26 -27
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -9
- transformers/models/speech_to_text/feature_extraction_speech_to_text.py +10 -13
- transformers/models/speech_to_text/modeling_speech_to_text.py +55 -57
- transformers/models/speech_to_text/processing_speech_to_text.py +4 -30
- transformers/models/speech_to_text/tokenization_speech_to_text.py +5 -6
- transformers/models/speecht5/configuration_speecht5.py +7 -9
- transformers/models/speecht5/feature_extraction_speecht5.py +16 -37
- transformers/models/speecht5/modeling_speecht5.py +172 -174
- transformers/models/speecht5/number_normalizer.py +0 -1
- transformers/models/speecht5/processing_speecht5.py +3 -37
- transformers/models/speecht5/tokenization_speecht5.py +4 -5
- transformers/models/splinter/configuration_splinter.py +6 -7
- transformers/models/splinter/modeling_splinter.py +62 -59
- transformers/models/splinter/tokenization_splinter.py +2 -4
- transformers/models/squeezebert/configuration_squeezebert.py +14 -2
- transformers/models/squeezebert/modeling_squeezebert.py +60 -62
- transformers/models/squeezebert/tokenization_squeezebert.py +0 -1
- transformers/models/stablelm/configuration_stablelm.py +28 -29
- transformers/models/stablelm/modeling_stablelm.py +44 -47
- transformers/models/starcoder2/configuration_starcoder2.py +30 -27
- transformers/models/starcoder2/modeling_starcoder2.py +38 -41
- transformers/models/starcoder2/modular_starcoder2.py +17 -19
- transformers/models/superglue/configuration_superglue.py +7 -3
- transformers/models/superglue/image_processing_superglue.py +15 -15
- transformers/models/superglue/image_processing_superglue_fast.py +8 -8
- transformers/models/superglue/modeling_superglue.py +41 -37
- transformers/models/superpoint/image_processing_superpoint.py +15 -15
- transformers/models/superpoint/image_processing_superpoint_fast.py +7 -9
- transformers/models/superpoint/modeling_superpoint.py +17 -16
- transformers/models/swiftformer/configuration_swiftformer.py +0 -1
- transformers/models/swiftformer/modeling_swiftformer.py +12 -14
- transformers/models/swin/configuration_swin.py +2 -5
- transformers/models/swin/modeling_swin.py +69 -78
- transformers/models/swin2sr/configuration_swin2sr.py +0 -1
- transformers/models/swin2sr/image_processing_swin2sr.py +10 -13
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +4 -7
- transformers/models/swin2sr/modeling_swin2sr.py +30 -30
- transformers/models/swinv2/configuration_swinv2.py +2 -5
- transformers/models/swinv2/modeling_swinv2.py +65 -74
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -7
- transformers/models/switch_transformers/modeling_switch_transformers.py +35 -36
- transformers/models/switch_transformers/modular_switch_transformers.py +32 -33
- transformers/models/t5/configuration_t5.py +9 -9
- transformers/models/t5/modeling_t5.py +80 -85
- transformers/models/t5/tokenization_t5.py +1 -3
- transformers/models/t5gemma/configuration_t5gemma.py +43 -59
- transformers/models/t5gemma/modeling_t5gemma.py +105 -108
- transformers/models/t5gemma/modular_t5gemma.py +128 -142
- transformers/models/t5gemma2/configuration_t5gemma2.py +86 -100
- transformers/models/t5gemma2/modeling_t5gemma2.py +234 -194
- transformers/models/t5gemma2/modular_t5gemma2.py +279 -264
- transformers/models/table_transformer/configuration_table_transformer.py +18 -50
- transformers/models/table_transformer/modeling_table_transformer.py +73 -101
- transformers/models/tapas/configuration_tapas.py +12 -2
- transformers/models/tapas/modeling_tapas.py +65 -67
- transformers/models/tapas/tokenization_tapas.py +116 -153
- transformers/models/textnet/configuration_textnet.py +4 -7
- transformers/models/textnet/image_processing_textnet.py +22 -25
- transformers/models/textnet/image_processing_textnet_fast.py +8 -9
- transformers/models/textnet/modeling_textnet.py +28 -28
- transformers/models/time_series_transformer/configuration_time_series_transformer.py +5 -8
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +82 -84
- transformers/models/timesfm/configuration_timesfm.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +22 -25
- transformers/models/timesfm/modular_timesfm.py +21 -24
- transformers/models/timesformer/configuration_timesformer.py +0 -1
- transformers/models/timesformer/modeling_timesformer.py +13 -16
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -8
- transformers/models/timm_backbone/modeling_timm_backbone.py +25 -30
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +2 -3
- transformers/models/timm_wrapper/image_processing_timm_wrapper.py +4 -5
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +22 -19
- transformers/models/trocr/configuration_trocr.py +11 -8
- transformers/models/trocr/modeling_trocr.py +42 -42
- transformers/models/trocr/processing_trocr.py +5 -25
- transformers/models/tvp/configuration_tvp.py +10 -36
- transformers/models/tvp/image_processing_tvp.py +50 -52
- transformers/models/tvp/image_processing_tvp_fast.py +15 -15
- transformers/models/tvp/modeling_tvp.py +26 -28
- transformers/models/tvp/processing_tvp.py +2 -14
- transformers/models/udop/configuration_udop.py +16 -8
- transformers/models/udop/modeling_udop.py +73 -72
- transformers/models/udop/processing_udop.py +7 -26
- transformers/models/udop/tokenization_udop.py +80 -93
- transformers/models/umt5/configuration_umt5.py +8 -7
- transformers/models/umt5/modeling_umt5.py +87 -84
- transformers/models/unispeech/configuration_unispeech.py +4 -2
- transformers/models/unispeech/modeling_unispeech.py +54 -53
- transformers/models/unispeech/modular_unispeech.py +20 -22
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -2
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +70 -69
- transformers/models/unispeech_sat/modular_unispeech_sat.py +21 -23
- transformers/models/univnet/feature_extraction_univnet.py +14 -14
- transformers/models/univnet/modeling_univnet.py +7 -8
- transformers/models/upernet/configuration_upernet.py +8 -36
- transformers/models/upernet/modeling_upernet.py +11 -14
- transformers/models/vaultgemma/__init__.py +0 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +29 -33
- transformers/models/vaultgemma/modeling_vaultgemma.py +38 -40
- transformers/models/vaultgemma/modular_vaultgemma.py +29 -31
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3.py +40 -40
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +12 -14
- transformers/models/video_llama_3/modeling_video_llama_3.py +149 -112
- transformers/models/video_llama_3/modular_video_llama_3.py +152 -150
- transformers/models/video_llama_3/processing_video_llama_3.py +5 -39
- transformers/models/video_llama_3/video_processing_video_llama_3.py +45 -24
- transformers/models/video_llava/configuration_video_llava.py +4 -1
- transformers/models/video_llava/image_processing_video_llava.py +35 -38
- transformers/models/video_llava/modeling_video_llava.py +139 -143
- transformers/models/video_llava/processing_video_llava.py +38 -78
- transformers/models/video_llava/video_processing_video_llava.py +0 -1
- transformers/models/videomae/configuration_videomae.py +0 -1
- transformers/models/videomae/image_processing_videomae.py +31 -34
- transformers/models/videomae/modeling_videomae.py +17 -20
- transformers/models/videomae/video_processing_videomae.py +0 -1
- transformers/models/vilt/configuration_vilt.py +4 -2
- transformers/models/vilt/image_processing_vilt.py +29 -30
- transformers/models/vilt/image_processing_vilt_fast.py +15 -16
- transformers/models/vilt/modeling_vilt.py +103 -90
- transformers/models/vilt/processing_vilt.py +2 -14
- transformers/models/vipllava/configuration_vipllava.py +4 -1
- transformers/models/vipllava/modeling_vipllava.py +92 -67
- transformers/models/vipllava/modular_vipllava.py +78 -54
- transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +0 -1
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +28 -27
- transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +0 -1
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +45 -41
- transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +2 -16
- transformers/models/visual_bert/configuration_visual_bert.py +6 -2
- transformers/models/visual_bert/modeling_visual_bert.py +90 -92
- transformers/models/vit/configuration_vit.py +2 -3
- transformers/models/vit/image_processing_vit.py +19 -22
- transformers/models/vit/image_processing_vit_fast.py +0 -1
- transformers/models/vit/modeling_vit.py +20 -20
- transformers/models/vit_mae/configuration_vit_mae.py +0 -1
- transformers/models/vit_mae/modeling_vit_mae.py +32 -30
- transformers/models/vit_msn/configuration_vit_msn.py +0 -1
- transformers/models/vit_msn/modeling_vit_msn.py +21 -19
- transformers/models/vitdet/configuration_vitdet.py +2 -5
- transformers/models/vitdet/modeling_vitdet.py +14 -17
- transformers/models/vitmatte/configuration_vitmatte.py +7 -39
- transformers/models/vitmatte/image_processing_vitmatte.py +15 -18
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +16 -17
- transformers/models/vitmatte/modeling_vitmatte.py +10 -12
- transformers/models/vitpose/configuration_vitpose.py +7 -47
- transformers/models/vitpose/image_processing_vitpose.py +24 -25
- transformers/models/vitpose/image_processing_vitpose_fast.py +9 -10
- transformers/models/vitpose/modeling_vitpose.py +15 -15
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -5
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +13 -16
- transformers/models/vits/configuration_vits.py +4 -1
- transformers/models/vits/modeling_vits.py +43 -42
- transformers/models/vits/tokenization_vits.py +3 -4
- transformers/models/vivit/configuration_vivit.py +0 -1
- transformers/models/vivit/image_processing_vivit.py +36 -39
- transformers/models/vivit/modeling_vivit.py +9 -11
- transformers/models/vjepa2/__init__.py +0 -1
- transformers/models/vjepa2/configuration_vjepa2.py +0 -1
- transformers/models/vjepa2/modeling_vjepa2.py +39 -41
- transformers/models/vjepa2/video_processing_vjepa2.py +0 -1
- transformers/models/voxtral/__init__.py +0 -1
- transformers/models/voxtral/configuration_voxtral.py +0 -2
- transformers/models/voxtral/modeling_voxtral.py +41 -48
- transformers/models/voxtral/modular_voxtral.py +35 -38
- transformers/models/voxtral/processing_voxtral.py +25 -48
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -2
- transformers/models/wav2vec2/feature_extraction_wav2vec2.py +7 -10
- transformers/models/wav2vec2/modeling_wav2vec2.py +74 -126
- transformers/models/wav2vec2/processing_wav2vec2.py +6 -35
- transformers/models/wav2vec2/tokenization_wav2vec2.py +20 -332
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +49 -52
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +45 -48
- transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +6 -35
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -2
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +62 -65
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +15 -18
- transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +16 -17
- transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +36 -55
- transformers/models/wavlm/configuration_wavlm.py +4 -2
- transformers/models/wavlm/modeling_wavlm.py +49 -49
- transformers/models/wavlm/modular_wavlm.py +4 -5
- transformers/models/whisper/configuration_whisper.py +6 -5
- transformers/models/whisper/english_normalizer.py +3 -4
- transformers/models/whisper/feature_extraction_whisper.py +9 -24
- transformers/models/whisper/generation_whisper.py +26 -49
- transformers/models/whisper/modeling_whisper.py +71 -73
- transformers/models/whisper/processing_whisper.py +3 -20
- transformers/models/whisper/tokenization_whisper.py +9 -30
- transformers/models/x_clip/configuration_x_clip.py +4 -2
- transformers/models/x_clip/modeling_x_clip.py +94 -96
- transformers/models/x_clip/processing_x_clip.py +2 -14
- transformers/models/xcodec/configuration_xcodec.py +4 -6
- transformers/models/xcodec/modeling_xcodec.py +15 -17
- transformers/models/xglm/configuration_xglm.py +9 -8
- transformers/models/xglm/modeling_xglm.py +49 -55
- transformers/models/xglm/tokenization_xglm.py +1 -4
- transformers/models/xlm/configuration_xlm.py +10 -8
- transformers/models/xlm/modeling_xlm.py +127 -131
- transformers/models/xlm/tokenization_xlm.py +3 -5
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -3
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +96 -98
- transformers/models/xlm_roberta/modular_xlm_roberta.py +50 -53
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +1 -4
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -2
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +97 -99
- transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +67 -70
- transformers/models/xlnet/configuration_xlnet.py +3 -12
- transformers/models/xlnet/modeling_xlnet.py +149 -162
- transformers/models/xlnet/tokenization_xlnet.py +1 -4
- transformers/models/xlstm/configuration_xlstm.py +8 -12
- transformers/models/xlstm/modeling_xlstm.py +61 -96
- transformers/models/xmod/configuration_xmod.py +11 -3
- transformers/models/xmod/modeling_xmod.py +111 -116
- transformers/models/yolos/configuration_yolos.py +0 -1
- transformers/models/yolos/image_processing_yolos.py +60 -62
- transformers/models/yolos/image_processing_yolos_fast.py +42 -45
- transformers/models/yolos/modeling_yolos.py +19 -21
- transformers/models/yolos/modular_yolos.py +17 -19
- transformers/models/yoso/configuration_yoso.py +8 -2
- transformers/models/yoso/modeling_yoso.py +60 -62
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -8
- transformers/models/zamba/modeling_zamba.py +93 -125
- transformers/models/zamba2/configuration_zamba2.py +44 -50
- transformers/models/zamba2/modeling_zamba2.py +137 -165
- transformers/models/zamba2/modular_zamba2.py +79 -74
- transformers/models/zoedepth/configuration_zoedepth.py +17 -41
- transformers/models/zoedepth/image_processing_zoedepth.py +28 -29
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +20 -21
- transformers/models/zoedepth/modeling_zoedepth.py +19 -19
- transformers/pipelines/__init__.py +47 -106
- transformers/pipelines/any_to_any.py +15 -23
- transformers/pipelines/audio_utils.py +1 -2
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +13 -17
- transformers/pipelines/image_text_to_text.py +1 -2
- transformers/pipelines/question_answering.py +4 -43
- transformers/pipelines/text_classification.py +1 -14
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/pipelines/token_classification.py +1 -22
- transformers/pipelines/video_classification.py +1 -9
- transformers/pipelines/zero_shot_audio_classification.py +0 -1
- transformers/pipelines/zero_shot_classification.py +0 -6
- transformers/pipelines/zero_shot_image_classification.py +0 -7
- transformers/processing_utils.py +128 -137
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/base.py +10 -0
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_quark.py +0 -1
- transformers/quantizers/quantizer_torchao.py +3 -19
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +6 -65
- transformers/tokenization_mistral_common.py +563 -903
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +228 -341
- transformers/tokenization_utils_sentencepiece.py +5 -6
- transformers/tokenization_utils_tokenizers.py +36 -7
- transformers/trainer.py +30 -41
- transformers/trainer_jit_checkpoint.py +1 -2
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +414 -420
- transformers/utils/__init__.py +1 -4
- transformers/utils/attention_visualizer.py +1 -1
- transformers/utils/auto_docstring.py +567 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/dummy_pt_objects.py +0 -42
- transformers/utils/generic.py +70 -34
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +135 -107
- transformers/utils/quantization_config.py +8 -31
- transformers/video_processing_utils.py +24 -25
- transformers/video_utils.py +21 -23
- {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/METADATA +120 -239
- transformers-5.1.0.dist-info/RECORD +2092 -0
- {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -229
- transformers-5.0.0rc2.dist-info/RECORD +0 -2042
- {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
|
@@ -14,39 +14,39 @@
|
|
|
14
14
|
import os
|
|
15
15
|
import re
|
|
16
16
|
import shutil
|
|
17
|
-
import
|
|
18
|
-
from collections.abc import Callable, Mapping, Sized
|
|
17
|
+
from collections.abc import Callable, Sequence
|
|
19
18
|
from enum import Enum
|
|
20
19
|
from pathlib import Path
|
|
21
|
-
from typing import Any, Union, overload
|
|
20
|
+
from typing import Any, Literal, Union, overload
|
|
22
21
|
|
|
23
22
|
import numpy as np
|
|
24
23
|
from huggingface_hub import create_repo
|
|
25
24
|
|
|
26
25
|
from transformers.audio_utils import load_audio_as
|
|
27
26
|
from transformers.tokenization_utils_base import (
|
|
28
|
-
LARGE_INTEGER,
|
|
29
27
|
VERY_LARGE_INTEGER,
|
|
28
|
+
AddedToken,
|
|
30
29
|
BatchEncoding,
|
|
31
30
|
EncodedInput,
|
|
32
31
|
PreTokenizedInput,
|
|
32
|
+
PreTrainedTokenizerBase,
|
|
33
33
|
TextInput,
|
|
34
34
|
TruncationStrategy,
|
|
35
35
|
)
|
|
36
36
|
from transformers.utils import PaddingStrategy, TensorType, add_end_docstrings, logging, to_py_obj
|
|
37
|
-
from transformers.utils.generic import is_torch_tensor
|
|
38
|
-
from transformers.utils.hub import PushToHubMixin
|
|
39
37
|
from transformers.utils.import_utils import is_mistral_common_available, is_torch_available, requires
|
|
40
38
|
|
|
41
39
|
|
|
42
40
|
if is_mistral_common_available():
|
|
43
41
|
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
|
44
42
|
from mistral_common.protocol.instruct.validator import ValidationMode
|
|
45
|
-
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy,
|
|
46
|
-
from mistral_common.tokens.tokenizers.image import MultiModalVersion
|
|
43
|
+
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy, SpecialTokens
|
|
47
44
|
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
|
|
48
45
|
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
|
|
49
|
-
from mistral_common.tokens.tokenizers.utils import
|
|
46
|
+
from mistral_common.tokens.tokenizers.utils import (
|
|
47
|
+
download_tokenizer_from_hf_hub,
|
|
48
|
+
get_one_valid_tokenizer_file,
|
|
49
|
+
)
|
|
50
50
|
|
|
51
51
|
|
|
52
52
|
if is_torch_available():
|
|
@@ -103,6 +103,10 @@ ENCODE_KWARGS_DOCSTRING = r"""
|
|
|
103
103
|
"""
|
|
104
104
|
|
|
105
105
|
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
|
|
106
|
+
return_token_type_ids (`bool`, *optional*):
|
|
107
|
+
Whether to return token type IDs. For `MistralCommonBackend` it returns a list of zeros of the sequence length as only one sequence is supported.
|
|
108
|
+
|
|
109
|
+
[What are token type IDs?](../glossary#token-type-ids)
|
|
106
110
|
return_attention_mask (`bool`, *optional*):
|
|
107
111
|
Whether to return the attention mask. If left to the default, will return the attention mask according
|
|
108
112
|
to the specific tokenizer's default, defined by the `return_outputs` attribute.
|
|
@@ -118,6 +122,8 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
|
|
|
118
122
|
Whether or not to return the lengths of the encoded inputs.
|
|
119
123
|
verbose (`bool`, *optional*, defaults to `True`):
|
|
120
124
|
Whether or not to print more information and warnings.
|
|
125
|
+
return_offsets_mapping (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
|
|
126
|
+
split_special_tokens (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
|
|
121
127
|
**kwargs: passed to the `self.tokenize()` method
|
|
122
128
|
|
|
123
129
|
Return:
|
|
@@ -149,8 +155,35 @@ class MistralTokenizerType(str, Enum):
|
|
|
149
155
|
tekken = "tekken"
|
|
150
156
|
|
|
151
157
|
|
|
158
|
+
@overload
|
|
159
|
+
def _maybe_remove_lang(text: str, skip_special_tokens: bool) -> str: ...
|
|
160
|
+
@overload
|
|
161
|
+
def _maybe_remove_lang(text: list[str], skip_special_tokens: bool) -> list[str]: ...
|
|
162
|
+
def _maybe_remove_lang(text: str | list[str], skip_special_tokens: bool) -> str | list[str]:
|
|
163
|
+
# in the specific case of Voxtral, the added f"lang:xx" (always a two char language code since it follows ISO 639-1 alpha-2 format)
|
|
164
|
+
# is not considered as a special token by mistral-common and is encoded/ decoded as normal text.
|
|
165
|
+
# Nevertheless we should remove it to ease users life.
|
|
166
|
+
if not skip_special_tokens:
|
|
167
|
+
return text
|
|
168
|
+
|
|
169
|
+
if isinstance(text, str):
|
|
170
|
+
return re.sub(r"^lang:[a-z]{2}", "", text)
|
|
171
|
+
|
|
172
|
+
return [re.sub(r"^lang:[a-z]{2}", "", string) for string in text]
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
_MAP_SPECIAL_TOKENS = {
|
|
176
|
+
"bos_token": SpecialTokens.bos.value,
|
|
177
|
+
"eos_token": SpecialTokens.eos.value,
|
|
178
|
+
"pad_token": SpecialTokens.pad.value,
|
|
179
|
+
"unk_token": SpecialTokens.unk.value,
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
_VALID_INIT_KWARGS = {"_from_auto", "backend", "files_loaded"}
|
|
183
|
+
|
|
184
|
+
|
|
152
185
|
@requires(backends=("mistral-common",))
|
|
153
|
-
class MistralCommonBackend(
|
|
186
|
+
class MistralCommonBackend(PreTrainedTokenizerBase):
|
|
154
187
|
"""
|
|
155
188
|
Class to wrap `mistral-common` tokenizers.
|
|
156
189
|
|
|
@@ -165,34 +198,13 @@ class MistralCommonBackend(PushToHubMixin):
|
|
|
165
198
|
For more info on `mistral-common`, see [mistral-common](https://github.com/mistralai/mistral-common).
|
|
166
199
|
|
|
167
200
|
This class is a wrapper around a `mistral_common.tokens.tokenizers.mistral.MistralTokenizer`.
|
|
168
|
-
It provides a Hugging Face compatible interface to tokenize using the official mistral-common tokenizer.
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
-
|
|
173
|
-
This is a lossy conversion for Tekkenizer as some decoding errors are collapsed into the same token.
|
|
174
|
-
- [`~MistralCommonBackend.encode`]: Encode a string to a list of integers.
|
|
175
|
-
- [`~MistralCommonBackend.decode`]: Decode a list of integers to a string.
|
|
176
|
-
- [`~MistralCommonBackend.batch_decode`]: Decode a batch of list of integers to a list of strings.
|
|
177
|
-
- [`~MistralCommonBackend.convert_tokens_to_ids`]: Convert a list of tokens to a list of integers.
|
|
178
|
-
- [`~MistralCommonBackend.convert_ids_to_tokens`]: Convert a list of integers to a list of tokens.
|
|
179
|
-
- [`~MistralCommonBackend.tokenize`]: Tokenize a string.
|
|
180
|
-
- [`~MistralCommonBackend.get_special_tokens_mask`]: Get the special tokens mask for a list of tokens.
|
|
181
|
-
- [`~MistralCommonBackend.prepare_for_model`]: Prepare a list of inputs for the model.
|
|
182
|
-
- [`~MistralCommonBackend.pad`]: Pad a list of inputs to the same length.
|
|
183
|
-
- [`~MistralCommonBackend.truncate_sequences`]: Truncate a list of sequences to the same length.
|
|
184
|
-
- [`~MistralCommonBackend.apply_chat_template`]: Apply a chat template to a list of messages.
|
|
185
|
-
- [`~MistralCommonBackend.__call__`]: Tokenize a string or a list of strings.
|
|
186
|
-
- [`~MistralCommonBackend.from_pretrained`]: Download and cache a pretrained tokenizer from the Hugging Face model hub or local directory.
|
|
187
|
-
- [`~MistralCommonBackend.save_pretrained`]: Save a tokenizer to a directory, so it can be reloaded using the `from_pretrained` class method.
|
|
188
|
-
- [`~MistralCommonBackend.push_to_hub`]: Upload tokenizer to the Hugging Face model hub.
|
|
189
|
-
|
|
190
|
-
Here are the key differences with the `PreTrainedTokenizerBase` class:
|
|
191
|
-
|
|
192
|
-
- Pair of sequences are not supported. The signature have been kept for compatibility but all arguments related to pair of sequences are ignored. The return values of pairs are returned as `None`.
|
|
201
|
+
It provides a Hugging Face compatible interface to tokenize using the official mistral-common tokenizer and inherits from the `PreTrainedTokenizerBase` class.
|
|
202
|
+
|
|
203
|
+
Here are the key behavior differences with the `PythonBackend` class:
|
|
204
|
+
|
|
205
|
+
- Pair of sequences are not supported. The signature has been kept for compatibility but all arguments related to pair of sequences are ignored. The return values for pairs are returned as `None`.
|
|
193
206
|
- The `is_split_into_words` argument is not supported.
|
|
194
|
-
-
|
|
195
|
-
- It is not possible to add new tokens to the tokenizer. Also the special tokens are handled differently from Transformers. In `mistral-common`, special tokens are never encoded directly. This means that: `tokenizer.encode("<s>")` will not return the ID of the `<s>` token. Instead, it will return a list of IDs corresponding to the tokenization of the string `"<s>"`. For more information, see the [mistral-common documentation](https://mistralai.github.io/mistral-common/usage/tokenizers/#special-tokens).
|
|
207
|
+
- It is not possible to add new tokens to the tokenizer. Special tokens are handled differently from Transformers. In `mistral-common`, special tokens are never encoded directly. This means that: `tokenizer.encode("<s>")` will not return the ID of the `<s>` token. Instead, it will return a list of IDs corresponding to the tokenization of the string `"<s>"`. For more information, see the [mistral-common documentation](https://mistralai.github.io/mistral-common/usage/tokenizers/#special-tokens).
|
|
196
208
|
|
|
197
209
|
If you have suggestions to improve this class, please open an issue on the [mistral-common GitHub repository](https://github.com/mistralai/mistral-common/issues) if it is related to the tokenizer or on the [Transformers GitHub repository](https://github.com/huggingface/transformers/issues) if it is related to the Hugging Face interface.
|
|
198
210
|
"""
|
|
@@ -200,6 +212,12 @@ class MistralCommonBackend(PushToHubMixin):
|
|
|
200
212
|
model_input_names: list[str] = ["input_ids", "attention_mask"]
|
|
201
213
|
padding_side: str = "left"
|
|
202
214
|
truncation_side: str = "right"
|
|
215
|
+
SPECIAL_TOKENS_ATTRIBUTES = [
|
|
216
|
+
"bos_token",
|
|
217
|
+
"eos_token",
|
|
218
|
+
"unk_token",
|
|
219
|
+
"pad_token",
|
|
220
|
+
]
|
|
203
221
|
|
|
204
222
|
def __init__(
|
|
205
223
|
self,
|
|
@@ -226,7 +244,7 @@ class MistralCommonBackend(PushToHubMixin):
|
|
|
226
244
|
Path to the tokenizer file to load the `MistralTokenizer`.
|
|
227
245
|
mode (`Union[str, ValidationMode]`, *optional*, defaults to `ValidationMode.test`):
|
|
228
246
|
The mode to use for the tokenizer. This will be passed to the `MistralTokenizer` constructor. Possible values are:
|
|
229
|
-
- `"finetuning"` or `ValidationMode.finetuning`: The
|
|
247
|
+
- `"finetuning"` or `ValidationMode.finetuning`: The fine-tuning mode.
|
|
230
248
|
- `"test"` or `ValidationMode.test`: The test mode.
|
|
231
249
|
It changes how the tokenizer validates the input and prepares the request to the model.
|
|
model_max_length (`int`, *optional*):
@@ -240,60 +258,49 @@ class MistralCommonBackend(PushToHubMixin):
truncation_side (`str`, *optional*):
The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
- model_input_names (`List[
+ model_input_names (`List[str]`, *optional*):
The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
`"attention_mask"`). Default value is picked from the class attribute of the same name.
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
- Whether or not the model should
+ Whether or not the model should clean up the spaces that were added when splitting the input text during the
tokenization process.
"""
- if kwargs:
+ if kwargs and not set(kwargs.keys()).issubset(_VALID_INIT_KWARGS):
raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported to init `MistralCommonBackend`.")

+ self.init_kwargs = {
+ "tokenizer_path": tokenizer_path,
+ "mode": mode,
+ "model_max_length": model_max_length,
+ "padding_side": padding_side,
+ "truncation_side": truncation_side,
+ "model_input_names": model_input_names,
+ "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
+ }
self._tokenizer_path = Path(tokenizer_path)
self._mode = self._get_validation_mode(mode)
+
self.tokenizer: MistralTokenizer = MistralTokenizer.from_file(str(self._tokenizer_path), mode=self._mode)
self._tokenizer_type = (
MistralTokenizerType.tekken
if isinstance(self.tokenizer.instruct_tokenizer.tokenizer, Tekkenizer)
else MistralTokenizerType.spm
)
- self.truncation_side = truncation_side
- self.padding_side = padding_side
- self.model_max_length = model_max_length
- self.cleanup_tokenization_spaces = clean_up_tokenization_spaces
- self.deprecation_warnings = {} # Use to store when we have already noticed a deprecation warning (avoid overlogging).
- self._all_special_tokens_ids = self._get_all_special_ids()
-
- if model_input_names is not None:
- if (
- not isinstance(model_input_names, (list, tuple))
- and len(model_input_names) == 0
- and not all(isinstance(i, str) for i in model_input_names)
- ):
- raise ValueError(
- "`model_input_names` should be a non-empty list or tuple of str but got an empty value."
- )
- self.model_input_names = model_input_names
-
self._cache_get_vocab: dict[str, int] | None = None

-
-
-
-
-
-
-
-
-
-
-
-
-
- .replace(" 's", "'s")
- .replace(" 've", "'ve")
- .replace(" 're", "'re")
+ self._all_special_ids = self._get_all_special_ids()
+ self._all_special_tokens = self.convert_ids_to_tokens(self.all_special_ids)
+
+ super().__init__(
+ truncation_side=truncation_side,
+ padding_side=padding_side,
+ model_max_length=model_max_length,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ extra_special_tokens=None, # Not used by this backend.
+ model_specific_special_tokens=None, # Not used by this backend.
+ model_input_names=model_input_names or self.model_input_names,
+ **_MAP_SPECIAL_TOKENS,
+ **kwargs,
)

@property
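For context on how the reworked constructor is meant to be called, here is a minimal usage sketch. It assumes `mistral-common` is installed, that `MistralCommonBackend` can be imported from the shown module, and that `tekken.json` is a valid local tokenizer file; the import path, file path, and argument values are placeholders, not part of the diff.

    from transformers.tokenization_mistral_common import MistralCommonBackend  # import path assumed

    # Hypothetical local tokenizer file; replace with a real tekken.json or SentencePiece model.
    backend = MistralCommonBackend(tokenizer_path="tekken.json", model_max_length=32768)

    # In 5.1.0 the constructor records its arguments and defers shared state to the base class.
    print(backend.init_kwargs["model_max_length"])   # 32768
    print(backend.truncation_side, backend.padding_side)  # now inherited attributes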
@@ -306,75 +313,19 @@ class MistralCommonBackend(PushToHubMixin):
"""
return self._mode

- @property
- def bos_token_id(self) -> int:
- """
- Id of the beginning of sentence token in the vocabulary.
- """
- return self.tokenizer.instruct_tokenizer.tokenizer.bos_id
-
- @property
- def eos_token_id(self) -> int:
- """
- Id of the end of sentence token in the vocabulary.
- """
- return self.tokenizer.instruct_tokenizer.tokenizer.eos_id
-
- @property
- def unk_token_id(self) -> int:
- """
- Id of the unknown token in the vocabulary.
- """
- return self.tokenizer.instruct_tokenizer.tokenizer.unk_id
-
- @property
- def pad_token_id(self) -> int:
- """
- Id of the padding token in the vocabulary.
- """
- return self.tokenizer.instruct_tokenizer.tokenizer.pad_id
-
- @property
- def bos_token(self) -> str:
- """
- String associated to the beginning of sentence token in the vocabulary.
- """
- return self.convert_ids_to_tokens(self.bos_token_id)
-
- @property
- def eos_token(self) -> str:
- """
- String associated to the end of sentence token in the vocabulary.
- """
- return self.convert_ids_to_tokens(self.eos_token_id)
-
- @property
- def unk_token(self) -> str:
- """
- String associated to the unknown token in the vocabulary.
- """
- return self.convert_ids_to_tokens(self.unk_token_id)
-
- @property
- def pad_token(self) -> str:
- """
- String associated to the padding token in the vocabulary.
- """
- return self.convert_ids_to_tokens(self.pad_token_id)
-
@property
def all_special_ids(self) -> list[int]:
"""
`list[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.).
"""
- return sorted(self.
+ return sorted(self._all_special_ids)

@property
def all_special_tokens(self) -> list[str]:
"""
`list[str]`: A list of all unique special tokens.
"""
- return self.
+ return self._all_special_tokens

@property
def vocab_size(self) -> int:
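The per-token `bos_token_id`/`eos_token_id`/`pad_token`/... overrides are removed in this hunk; the special tokens are resolved once in `__init__` and surfaced through the two cached properties and, presumably, the inherited base-class accessors. A hedged sketch of what that looks like from user code, reusing the hypothetical `backend` object from the earlier sketch:

    # `all_special_ids` returns a sorted copy of the set cached at construction time,
    # and `all_special_tokens` is its pre-computed string counterpart.
    special_ids = backend.all_special_ids
    special_tokens = backend.all_special_tokens
    print(len(special_ids), special_tokens[:3])

    # bos/eos/pad lookups are assumed to go through the base-class properties
    # instead of the removed per-token overrides shown above.
    print(backend.bos_token_id, backend.eos_token_id)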
@@ -435,6 +386,8 @@ class MistralCommonBackend(PushToHubMixin):
padding_side: str | None = None,
return_tensors: str | TensorType | None = None,
verbose: bool = True,
+ return_offsets_mapping: Literal[False] = False,
+ split_special_tokens: Literal[False] = False,
**kwargs,
) -> list[int]:
"""
@@ -446,37 +399,81 @@ class MistralCommonBackend(PushToHubMixin):
text_pair (`None`, *optional*):
Not supported by `MistralCommonBackend.encode`. Kept to match `PreTrainedTokenizerBase.encode` signature.
"""
+ if return_offsets_mapping or split_special_tokens:
+ raise ValueError(
+ "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
+ )
+
+ if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND, "only_first", "only_second"]:
+ raise ValueError(
+ "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonBackend`."
+ )
+
if kwargs:
raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.encode`.")
+
if text_pair:
raise ValueError("`MistralCommonBackend.encode` does not support `text_pair`.")

-
+ return super().encode(
+ text=text,
+ text_pair=text_pair,
+ add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
- pad_to_multiple_of=pad_to_multiple_of,
- verbose=verbose,
- )
-
- encoded_inputs = self._encode_plus(
- text,
- add_special_tokens=add_special_tokens,
- padding_strategy=padding_strategy,
- truncation_strategy=truncation_strategy,
- max_length=max_length,
stride=stride,
+ return_tensors=return_tensors,
pad_to_multiple_of=pad_to_multiple_of,
padding_side=padding_side,
- return_tensors=return_tensors,
- return_attention_mask=False,
- return_overflowing_tokens=False,
- return_special_tokens_mask=False,
- return_length=False,
verbose=verbose,
)

-
+ def _decode(
+ self,
+ token_ids: int | list[int],
+ skip_special_tokens: bool = False,
+ clean_up_tokenization_spaces: bool | None = None,
+ **kwargs,
+ ) -> str:
+ if kwargs:
+ raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.decode`.")
+
+ token_ids = to_py_obj(token_ids)
+
+ if isinstance(token_ids, int):
+ token_ids = [token_ids]
+
+ special_token_policy = SpecialTokenPolicy.IGNORE if skip_special_tokens else SpecialTokenPolicy.KEEP
+
+ text = self.tokenizer.decode(token_ids, special_token_policy=special_token_policy)
+
+ # Apply tokenizer-specific cleanup if available and requested
+ clean_up_tokenization_spaces = (
+ clean_up_tokenization_spaces
+ if clean_up_tokenization_spaces is not None
+ else self.clean_up_tokenization_spaces
+ )
+ if clean_up_tokenization_spaces:
+ # Call custom cleanup method if it exists (e.g., for CLVP's [SPACE] token replacement)
+ if hasattr(self, "clean_up_tokenization") and callable(self.clean_up_tokenization):
+ text = self.clean_up_tokenization(text)
+ else:
+ # Otherwise apply standard cleanup
+ text = (
+ text.replace(" .", ".")
+ .replace(" ?", "?")
+ .replace(" !", "!")
+ .replace(" ,", ",")
+ .replace(" ' ", "'")
+ .replace(" n't", "n't")
+ .replace(" 'm", "'m")
+ .replace(" 's", "'s")
+ .replace(" 've", "'ve")
+ .replace(" 're", "'re")
+ )
+
+ return _maybe_remove_lang(text=text, skip_special_tokens=skip_special_tokens)

def decode(
self,
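The standard cleanup applied when no custom `clean_up_tokenization` hook exists is the same string surgery shown in the `+` lines above. A self-contained sketch of that step (plain Python, independent of the tokenizer, function name illustrative):

    def standard_cleanup(text: str) -> str:
        # Mirrors the replace-chain in the new `_decode`: re-attach punctuation and
        # English contractions that whitespace tokenization splits off.
        return (
            text.replace(" .", ".")
            .replace(" ?", "?")
            .replace(" !", "!")
            .replace(" ,", ",")
            .replace(" ' ", "'")
            .replace(" n't", "n't")
            .replace(" 'm", "'m")
            .replace(" 's", "'s")
            .replace(" 've", "'ve")
            .replace(" 're", "'re")
        )

    print(standard_cleanup("they 're here , are n't they ?"))  # "they're here, aren't they?"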
@@ -484,7 +481,7 @@ class MistralCommonBackend(PushToHubMixin):
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool | None = None,
**kwargs,
- ) ->
+ ) -> str | list[str]:
"""
Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
tokens and clean up tokenization spaces.
@@ -509,16 +506,7 @@ class MistralCommonBackend(PushToHubMixin):
if kwargs:
raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.decode`.")

-
-
- if isinstance(token_ids, (list, tuple)) and len(token_ids) > 0 and isinstance(token_ids[0], (list, tuple)):
- return self._batch_decode(
- sequences=token_ids,
- skip_special_tokens=skip_special_tokens,
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
- )
-
- return self._decode(
+ return super().decode(
token_ids=token_ids,
skip_special_tokens=skip_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
@@ -555,63 +543,12 @@ class MistralCommonBackend(PushToHubMixin):
if kwargs:
raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.batch_decode`.")

- return
+ return super().batch_decode(
sequences=sequences,
skip_special_tokens=skip_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
)

- def _decode(
- self,
- token_ids: Union[int, list[int], list[list[int]], np.ndarray, "torch.Tensor"],
- skip_special_tokens: bool = False,
- clean_up_tokenization_spaces: bool | None = None,
- ) -> str:
- clean_up_tokenization_spaces = clean_up_tokenization_spaces or self.cleanup_tokenization_spaces
-
- # Convert inputs to python lists
- if isinstance(token_ids, int):
- token_ids = [token_ids]
-
- token_ids = to_py_obj(token_ids)
-
- special_token_policy = SpecialTokenPolicy.IGNORE if skip_special_tokens else SpecialTokenPolicy.KEEP
-
- decoded_string = self.tokenizer.decode(token_ids, special_token_policy=special_token_policy)
- if clean_up_tokenization_spaces:
- decoded_string = self.clean_up_tokenization(decoded_string)
-
- # in the specific case of Voxtral, the added f"lang:xx" (always a two char language code since it follows ISO 639-1 alpha-2 format)
- # is not considered as a special token by mistral-common and is encoded/ decoded as normal text.
- # Nevertheless we should remove it to ease users life.
- if skip_special_tokens:
- decoded_string = re.sub(r"^lang:[a-z]{2}", "", decoded_string)
-
- return decoded_string
-
- def _batch_decode(
- self,
- sequences: Union[list[int], list[list[int]], np.ndarray, "torch.Tensor"],
- skip_special_tokens: bool = False,
- clean_up_tokenization_spaces: bool | None = None,
- ) -> list[str]:
- return [
- self._decode(
- seq,
- skip_special_tokens=skip_special_tokens,
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
- )
- for seq in sequences
- ]
-
- def _is_control_token(self, token_id: int) -> bool:
- if self._tokenizer_type == MistralTokenizerType.spm:
- return token_id in self.tokenizer.instruct_tokenizer.tokenizer._control_tokens
- elif self._tokenizer_type == MistralTokenizerType.tekken:
- return token_id < self.tokenizer.instruct_tokenizer.tokenizer.num_special_tokens
- else:
- raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
-
@overload
def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ...
@overload
@@ -632,22 +569,22 @@ class MistralCommonBackend(PushToHubMixin):
"""

if isinstance(ids, int):
-
+ return_int = True
ids = [ids]
else:
-
+ return_int = False

tokens: list[str] = []
for token_id in ids:
- if self.
+ if self.tokenizer.instruct_tokenizer.tokenizer.is_special(token_id) and skip_special_tokens:
continue
tokens.append(self.tokenizer.instruct_tokenizer.tokenizer.id_to_piece(token_id))

- if
-
-
-
+ if return_int and tokens == []:
+ raise ValueError(f"Invalid token id {ids[0]}.")
+ elif return_int:
return tokens[0]
+
return tokens

def _tekken_piece_to_id(self, piece: str, warn: bool) -> int:
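The rewritten `convert_ids_to_tokens` now remembers whether a single `int` was passed so it can return a bare string (and raise if that one id is invalid or skipped). A hedged usage sketch, reusing the hypothetical `backend` object from the earlier sketches:

    ids = backend.encode("Hello world", add_special_tokens=True)
    tokens = backend.convert_ids_to_tokens(ids)      # list[str], one token per id
    first = backend.convert_ids_to_tokens(ids[0])    # a single str for a single int id
    print(first, tokens[:3])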
@@ -708,7 +645,13 @@ class MistralCommonBackend(PushToHubMixin):
tokens_ids = self.tokenizer.instruct_tokenizer.tokenizer.encode(text, bos=add_special_tokens, eos=add_eos)
return tokens_ids

- def tokenize(
+ def tokenize(
+ self,
+ text: TextInput,
+ return_offsets_mapping: Literal[False] = False,
+ split_special_tokens: Literal[False] = False,
+ **kwargs,
+ ) -> list[str]:
"""
Converts a string into a sequence of tokens, using the tokenizer.

@@ -717,6 +660,8 @@ class MistralCommonBackend(PushToHubMixin):
Args:
text (`str`):
The sequence to be encoded.
+ return_offsets_mapping (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
+ split_special_tokens (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
**kwargs (additional keyword arguments):
Not supported by `MistralCommonBackend.tokenize`.
Will raise an error if used.
@@ -724,40 +669,164 @@ class MistralCommonBackend(PushToHubMixin):
Returns:
`list[str]`: The list of tokens.
"""
+ if return_offsets_mapping or split_special_tokens:
+ raise ValueError(
+ "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
+ )
+
if kwargs:
raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.tokenize`.")

return self.convert_ids_to_tokens(self._text_to_ids(text, add_special_tokens=False), skip_special_tokens=False)

- def
+ def _get_all_special_ids(self) -> set[int]:
+ if self._tokenizer_type == MistralTokenizerType.tekken:
+ return self.tokenizer.instruct_tokenizer.tokenizer._special_token_ids
+ elif self._tokenizer_type == MistralTokenizerType.spm:
+ return {
+ token_id
+ for token_id in range(self.tokenizer.instruct_tokenizer.tokenizer.n_words)
+ if self.tokenizer.instruct_tokenizer.tokenizer.is_special(token_id)
+ }
+ else:
+ raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
+
+ def get_special_tokens_mask(
+ self, token_ids_0: list[int], token_ids_1: None = None, already_has_special_tokens: bool = False
+ ) -> list[int]:
+ """
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+ special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
+
+ Args:
+ token_ids_0 (`list[int]`): List of ids of the sequence.
+ token_ids_1 (`None`, *optional*): None, kept to match Transformers' implementation.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+
+ Returns:
+ A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+ if token_ids_1 is not None:
+ raise ValueError(
+ "`token_ids_1` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
+ )
+
+ if already_has_special_tokens:
+ return [1 if int(token_id) in self._all_special_ids else 0 for token_id in token_ids_0]
+
+ if self.mode == ValidationMode.test:
+ # [BOS] seq0
+ return [1] + ([0] * len(token_ids_0))
+ else:
+ # [BOS] seq0 [EOS]
+ return [1] + ([0] * len(token_ids_0)) + [1]
+
+ def _encode_plus( # type: ignore[override]
self,
- text: TextInput | EncodedInput,
+ text: TextInput | PreTokenizedInput | EncodedInput,
+ text_pair: None = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: int | None = None,
stride: int = 0,
+ is_split_into_words: bool = False,
pad_to_multiple_of: int | None = None,
padding_side: str | None = None,
return_tensors: str | TensorType | None = None,
+ return_token_type_ids: bool | None = None,
return_attention_mask: bool | None = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_length: bool = False,
verbose: bool = True,
+ return_offsets_mapping: Literal[False] = False,
+ split_special_tokens: Literal[False] = False,
+ **kwargs,
) -> BatchEncoding:
+ # Detect batched inputs (list of sequences)
+ if text_pair is not None:
+ raise ValueError("`MistralCommonBackend` does not support `text_pair != None` for `_encode_plus`.")
+
+ if return_offsets_mapping or split_special_tokens:
+ raise ValueError(
+ "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
+ )
+
+ if kwargs:
+ raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend._encode_plus`.")
+
+ is_batched = isinstance(text, (list, tuple)) and (
+ (not text and not is_split_into_words)
+ or (text and is_split_into_words and isinstance(text[0], (list, tuple)))
+ or (text and not is_split_into_words and isinstance(text[0], (str, list, tuple)))
+ )
+
+ if is_batched:
+ batch_outputs = {}
+ one_overflowed = False
+ for current_text in text:
+ current_output = self._encode_plus(
+ text=current_text,
+ text_pair=None,
+ add_special_tokens=add_special_tokens,
+ padding_strategy=PaddingStrategy.DO_NOT_PAD, # we pad in batch afterward
+ truncation_strategy=truncation_strategy,
+ max_length=max_length,
+ stride=stride,
+ is_split_into_words=is_split_into_words,
+ pad_to_multiple_of=None, # we pad in batch afterward
+ padding_side=None, # we pad in batch afterward
+ return_tensors=None, # We convert the whole batch to tensors at the end
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=False, # we pad in batch afterward
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_length=return_length,
+ verbose=verbose,
+ )
+ for key, value in current_output.items():
+ batch_outputs.setdefault(key, []).append(value)
+
+ # To ensure the list is built for each sample, we need to add this.
+ if return_overflowing_tokens and not return_tensors:
+ if "overflowing_tokens" not in current_output:
+ batch_outputs.setdefault("overflowing_tokens", []).append([0])
+ batch_outputs.setdefault("num_truncated_tokens", []).append([0])
+ else:
+ one_overflowed = True
+
+ # Remove overflow-related keys before tensor conversion if return_tensors is set
+ # Slow tokenizers don't support returning these as tensors
+ if return_overflowing_tokens and (return_tensors or not one_overflowed):
+ batch_outputs.pop("overflowing_tokens", None)
+ batch_outputs.pop("num_truncated_tokens", None)
+
+ batch_outputs = self.pad(
+ batch_outputs,
+ padding=padding_strategy.value,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
+ return_attention_mask=return_attention_mask,
+ )
+
+ return BatchEncoding(batch_outputs, tensor_type=return_tensors)
+
def get_input_ids(text):
if isinstance(text, str):
- return self._text_to_ids(text,
+ return self._text_to_ids(text, False)
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
return text
else:
raise ValueError(f"Input {text} is not valid. Should be a string, or a list/tuple of integers.")

-
+ first_ids = get_input_ids(text)

return self.prepare_for_model(
-
+ first_ids,
+ pair_ids=None,
add_special_tokens=add_special_tokens,
padding=padding_strategy.value,
truncation=truncation_strategy.value,
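The new `get_special_tokens_mask` marks one leading BOS, plus a trailing EOS outside test mode, around the sequence. A small standalone sketch of the mask it produces under those assumptions (function name illustrative):

    def special_tokens_mask(num_tokens: int, test_mode: bool) -> list[int]:
        # Mirrors the added method: 1 marks a special-token position, 0 a regular token.
        if test_mode:
            return [1] + [0] * num_tokens            # [BOS] seq0
        return [1] + [0] * num_tokens + [1]          # [BOS] seq0 [EOS]

    print(special_tokens_mask(4, test_mode=False))   # [1, 0, 0, 0, 0, 1]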
@@ -768,202 +837,62 @@ class MistralCommonBackend(PushToHubMixin):
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
+ return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
verbose=verbose,
)

-
+ @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+ def prepare_for_model(
self,
-
+ ids: list[int],
+ pair_ids: None = None,
add_special_tokens: bool = True,
-
-
+ padding: bool | str | PaddingStrategy = False,
+ truncation: bool | str | TruncationStrategy | None = None,
max_length: int | None = None,
stride: int = 0,
pad_to_multiple_of: int | None = None,
padding_side: str | None = None,
return_tensors: str | TensorType | None = None,
+ return_token_type_ids: bool | None = None,
return_attention_mask: bool | None = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_length: bool = False,
verbose: bool = True,
+ prepend_batch_axis: bool = False,
+ return_offsets_mapping: Literal[False] = False,
+ split_special_tokens: Literal[False] = False,
+ **kwargs,
) -> BatchEncoding:
- def get_input_ids(text):
- if isinstance(text, str):
- return self._text_to_ids(text, add_special_tokens)
- elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
- return text
- else:
- raise ValueError("Input is not valid. Should be a string or a list/tuple of integers.")
-
- input_ids = []
- for ids in batch_text:
- input_ids.append(get_input_ids(ids))
-
- batch_outputs = self._batch_prepare_for_model(
- input_ids,
- add_special_tokens=add_special_tokens,
- padding_strategy=padding_strategy,
- truncation_strategy=truncation_strategy,
- max_length=max_length,
- stride=stride,
- pad_to_multiple_of=pad_to_multiple_of,
- padding_side=padding_side,
- return_attention_mask=return_attention_mask,
- return_overflowing_tokens=return_overflowing_tokens,
- return_special_tokens_mask=return_special_tokens_mask,
- return_length=return_length,
- return_tensors=return_tensors,
- verbose=verbose,
- )
-
- return BatchEncoding(batch_outputs)
-
- def _get_all_special_ids(self) -> set[int]:
- if self._tokenizer_type == MistralTokenizerType.tekken:
- return {t["rank"] for t in self.tokenizer.instruct_tokenizer.tokenizer._all_special_tokens}
- elif self._tokenizer_type == MistralTokenizerType.spm:
- return self.tokenizer.instruct_tokenizer.tokenizer._control_tokens
- else:
- raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
-
- def get_special_tokens_mask(
- self, token_ids_0: list, token_ids_1: None = None, already_has_special_tokens: bool = False
- ) -> list[int]:
"""
-
- special tokens
+ Prepares a sequence of input id so that it can be used by the model. It
+ adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
+ manages a moving window (with user defined stride) for overflowing tokens.

Args:
-
-
-
+ ids (`list[int]`):
+ Tokenized input ids of the first sequence.
+ pair_ids (`None`, *optional*):
Not supported by `MistralCommonBackend`. Kept to match the interface of `PreTrainedTokenizerBase`.
- already_has_special_tokens (`bool`, *optional*, defaults to `False`):
- Whether or not the token list is already formatted with special tokens for the model.
-
- Returns:
- A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
- if
+ if return_offsets_mapping or split_special_tokens:
raise ValueError(
- "`
+ "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
)
-
+
+ if pair_ids is not None:
raise ValueError(
- "`
+ "`pair_ids` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
)

-
-
-
-
- self,
- batch_ids: list[PreTokenizedInput | list[int]],
- add_special_tokens: bool = True,
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
- truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
- max_length: int | None = None,
- stride: int = 0,
- pad_to_multiple_of: int | None = None,
- padding_side: str | None = None,
- return_tensors: str | None = None,
- return_attention_mask: bool | None = None,
- return_overflowing_tokens: bool = False,
- return_special_tokens_mask: bool = False,
- return_length: bool = False,
- verbose: bool = True,
- ) -> BatchEncoding:
- """
- Prepares a sequence of input id so that it can be used by the model. It
- adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
- manages a moving window (with user defined stride) for overflowing tokens.
-
- Args:
- batch_ids: list of tokenized input ids
- """
-
- batch_outputs = {}
- for ids in batch_ids:
- outputs = self.prepare_for_model(
- ids,
- add_special_tokens=add_special_tokens,
- padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward
- truncation=truncation_strategy.value,
- max_length=max_length,
- stride=stride,
- pad_to_multiple_of=None, # we pad in batch afterward
- padding_side=None, # we pad in batch afterward
- return_attention_mask=False, # we pad in batch afterward
- return_overflowing_tokens=return_overflowing_tokens,
- return_special_tokens_mask=return_special_tokens_mask,
- return_length=return_length,
- return_tensors=None, # We convert the whole batch to tensors at the end
- prepend_batch_axis=False,
- verbose=verbose,
- )
-
- for key, value in outputs.items():
- if key not in batch_outputs:
- batch_outputs[key] = []
- batch_outputs[key].append(value)
-
- batch_outputs = self.pad(
- batch_outputs,
- padding=padding_strategy.value,
- max_length=max_length,
- pad_to_multiple_of=pad_to_multiple_of,
- padding_side=padding_side,
- return_attention_mask=return_attention_mask,
- )
-
- batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
-
- return batch_outputs
-
- @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
- def prepare_for_model(
- self,
- ids: list[int],
- pair_ids: None = None,
- add_special_tokens: bool = True,
- padding: bool | str | PaddingStrategy = False,
- truncation: bool | str | TruncationStrategy | None = None,
- max_length: int | None = None,
- stride: int = 0,
- pad_to_multiple_of: int | None = None,
- padding_side: str | None = None,
- return_tensors: str | TensorType | None = None,
- return_attention_mask: bool | None = None,
- return_overflowing_tokens: bool = False,
- return_special_tokens_mask: bool = False,
- return_length: bool = False,
- verbose: bool = True,
- prepend_batch_axis: bool = False,
- **kwargs,
- ) -> BatchEncoding:
- """
- Prepares a sequence of input id so that it can be used by the model. It
- adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
- manages a moving window (with user defined stride) for overflowing tokens.
-
- Args:
- ids (`list[int]`):
- Tokenized input ids of the first sequence.
- pair_ids (`None`, *optional*):
- Not supported by `MistralCommonBackend`. Kept to match the interface of `PreTrainedTokenizerBase`.
- """
- if pair_ids is not None:
- raise ValueError(
- "`pair_ids` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
- )
- if kwargs:
- raise ValueError(
- f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.prepare_for_model`."
- )
+ if kwargs:
+ raise ValueError(
+ f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.prepare_for_model`."
+ )

padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
padding=padding,
@@ -971,39 +900,65 @@ class MistralCommonBackend(PushToHubMixin):
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
+ **kwargs,
)

-
+ # Validation
+ if (
+ return_overflowing_tokens
+ and truncation_strategy == TruncationStrategy.LONGEST_FIRST
+ and pair_ids is not None
+ ):
+ raise ValueError(
+ "Not possible to return overflowing tokens for pair of sequences with the "
+ "`longest_first`. Please select another truncation strategy than `longest_first`, "
+ "for instance `only_second` or `only_first`."
+ )

- #
+ # Defaults
+ if return_token_type_ids is None:
+ return_token_type_ids = "token_type_ids" in self.model_input_names
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names

-
+ # Truncation
+ num_special = self.num_special_tokens_to_add(pair=False) if add_special_tokens else 0
+ total_len = len(ids) + len(pair_ids or []) + num_special

- # Truncation: Handle max sequence length
overflowing_tokens = []
- if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and
+ if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
ids, _, overflowing_tokens = self.truncate_sequences(
ids,
-
+ pair_ids=None,
+ num_tokens_to_remove=total_len - max_length,
truncation_strategy=truncation_strategy,
stride=stride,
)

-
-
-
+ # Add special tokens
+ if add_special_tokens:
+ sequence = self.build_inputs_with_special_tokens(ids, None)
+ token_type_ids = self.create_token_type_ids_from_sequences(ids, None)
+ else:
+ sequence = ids
+ token_type_ids = [0] * len(sequence)

- # Build output
- encoded_inputs
+ # Build output
+ encoded_inputs = {"input_ids": sequence}
+ if return_token_type_ids:
+ encoded_inputs["token_type_ids"] = token_type_ids
if return_special_tokens_mask:
-
-
-
-
+ encoded_inputs["special_tokens_mask"] = (
+ self.get_special_tokens_mask(ids, None) if add_special_tokens else [0] * len(sequence)
+ )
+ if return_overflowing_tokens and not return_tensors and overflowing_tokens:
+ encoded_inputs["overflowing_tokens"] = overflowing_tokens
+ encoded_inputs["num_truncated_tokens"] = total_len - max_length if max_length else 0
+
+ # Check sequence length and warn if needed
+ self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)

- #
+ # Pad
if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
encoded_inputs = self.pad(
encoded_inputs,
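The truncation bookkeeping added to `prepare_for_model` boils down to a small length computation before `truncate_sequences` is called. A standalone sketch of that arithmetic (names are illustrative, not the real method):

    def tokens_to_remove(ids: list[int], max_length: int, num_special: int) -> int:
        # total_len counts the raw ids plus the special tokens that will be added;
        # anything beyond max_length has to be truncated away.
        total_len = len(ids) + num_special
        return max(0, total_len - max_length)

    print(tokens_to_remove(list(range(10)), max_length=8, num_special=2))  # 4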
@@ -1017,362 +972,9 @@ class MistralCommonBackend(PushToHubMixin):
if return_length:
encoded_inputs["length"] = len(encoded_inputs["input_ids"])

-
- encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
- )
-
- return batch_outputs
-
- def _get_padding_truncation_strategies(
- self,
- padding: str | PaddingStrategy | bool = False,
- truncation: str | TruncationStrategy | bool | None = None,
- max_length: int | None = None,
- pad_to_multiple_of: int | None = None,
- verbose: bool = True,
- **kwargs,
- ):
- """
- Find the correct padding/truncation strategy.
- """
-
- # Backward compatibility for previous behavior, maybe we should deprecate it:
- # If you only set max_length, it activates truncation for max_length
- if max_length is not None and padding is False and truncation is None:
- if verbose:
- if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
- logger.warning(
- "Truncation was not explicitly activated but `max_length` is provided a specific value, please"
- " use `truncation=True` to explicitly truncate examples to max length. Defaulting to"
- " 'longest_first' truncation strategy."
- )
- self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
- truncation = "longest_first"
-
- # Get padding strategy
- if padding is not False:
- if padding is True:
- if verbose:
- if max_length is not None and (
- truncation is None or truncation is False or truncation == "do_not_truncate"
- ):
- warnings.warn(
- "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
- "To pad to max length, use `padding='max_length'`."
- )
- padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch
- elif not isinstance(padding, PaddingStrategy):
- padding_strategy = PaddingStrategy(padding)
- elif isinstance(padding, PaddingStrategy):
- padding_strategy = padding
- else:
- padding_strategy = PaddingStrategy.DO_NOT_PAD
-
- # Get truncation strategy
- if truncation is not False and truncation is not None:
- if truncation is True:
- truncation_strategy = (
- TruncationStrategy.LONGEST_FIRST
- ) # Default to truncate the longest sequences in pairs of inputs
- elif not isinstance(truncation, TruncationStrategy):
- truncation_strategy = TruncationStrategy(truncation)
- elif isinstance(truncation, TruncationStrategy):
- truncation_strategy = truncation
- if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND]:
- raise ValueError(
- "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonBackend`."
- )
- else:
- truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
-
- # Set max length if needed
- if max_length is None:
- if padding_strategy == PaddingStrategy.MAX_LENGTH:
- if self.model_max_length > LARGE_INTEGER:
- if verbose:
- if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
- logger.warning(
- "Asking to pad to max_length but no maximum length is provided and the model has no"
- " predefined maximum length. Default to no padding."
- )
- self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
- padding_strategy = PaddingStrategy.DO_NOT_PAD
- else:
- max_length = self.model_max_length
-
- if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
- if self.model_max_length > LARGE_INTEGER:
- if verbose:
- if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
- logger.warning(
- "Asking to truncate to max_length but no maximum length is provided and the model has"
- " no predefined maximum length. Default to no truncation."
- )
- self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
- truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
- else:
- max_length = self.model_max_length
-
- # Test if we have a padding token
- if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.pad_token_id is None or self.pad_token_id < 0):
- raise ValueError(
- "Asking to pad but the tokenizer does not have a padding token. "
- "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
- "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
- )
-
- # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
- if (
- truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
- and padding_strategy != PaddingStrategy.DO_NOT_PAD
- and pad_to_multiple_of is not None
- and max_length is not None
- and (max_length % pad_to_multiple_of != 0)
- ):
- raise ValueError(
- "Truncation and padding are both activated but "
- f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
- )
-
- return padding_strategy, truncation_strategy, max_length, kwargs
-
- def _pad(
- self,
- encoded_inputs: dict[str, EncodedInput] | BatchEncoding,
- max_length: int | None = None,
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
- pad_to_multiple_of: int | None = None,
- padding_side: str | None = None,
- return_attention_mask: bool | None = None,
- ) -> dict:
- """
- Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-
- Args:
- encoded_inputs:
- Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`).
- max_length: maximum length of the returned list and optionally padding length (see below).
- Will truncate by taking into account the special tokens.
- padding_strategy: PaddingStrategy to use for padding.
-
- - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
- - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
- - PaddingStrategy.DO_NOT_PAD: Do not pad
- The tokenizer padding sides are defined in `padding_side` argument:
-
- - 'left': pads on the left of the sequences
- - 'right': pads on the right of the sequences
- pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
- This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
- `>= 7.5` (Volta).
- padding_side:
- The side on which the model should have padding applied. Should be selected between ['right', 'left'].
- Default value is picked from the class attribute of the same name.
- return_attention_mask:
- (optional) Set to False to avoid returning attention mask (default: set to model specifics)
- """
- # Load from model defaults
- if return_attention_mask is None:
- return_attention_mask = "attention_mask" in self.model_input_names
-
- required_input = encoded_inputs[self.model_input_names[0]]
-
- if padding_strategy == PaddingStrategy.LONGEST:
- max_length = len(required_input)
-
- if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
- max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-
- needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-
- # Initialize attention mask if not present.
- if return_attention_mask and "attention_mask" not in encoded_inputs:
- encoded_inputs["attention_mask"] = [1] * len(required_input)
-
- if needs_to_be_padded:
- difference = max_length - len(required_input)
- padding_side = padding_side if padding_side is not None else self.padding_side
-
- if padding_side == "right":
- if return_attention_mask:
- encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
- if "special_tokens_mask" in encoded_inputs:
- encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
- encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
- elif padding_side == "left":
- if return_attention_mask:
- encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
- if "special_tokens_mask" in encoded_inputs:
- encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
- encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
- else:
- raise ValueError(f"Invalid padding strategy:{padding_side}")
-
- return encoded_inputs
+ return BatchEncoding(encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis)

- def
- self,
- encoded_inputs: BatchEncoding
- | list[BatchEncoding]
- | dict[str, EncodedInput]
- | dict[str, list[EncodedInput]]
- | list[dict[str, EncodedInput]],
- padding: bool | str | PaddingStrategy = True,
- max_length: int | None = None,
- pad_to_multiple_of: int | None = None,
- padding_side: str | None = None,
- return_attention_mask: bool | None = None,
- return_tensors: str | TensorType | None = None,
- verbose: bool = True,
- ) -> BatchEncoding:
- """
- Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
- in the batch.
-
- Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,
- `self.pad_token_id`).
- <Tip>
-
- If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors, the
- result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
- PyTorch tensors, you will lose the specific device of your tensors however.
-
- </Tip>
-
- Args:
- encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, list[int]]`, `Dict[str, list[list[int]]` or `List[Dict[str, list[int]]]`):
- Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, list[int]]`) or a batch of
- tokenized inputs (list of [`BatchEncoding`], *Dict[str, list[list[int]]]* or *List[Dict[str,
- list[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
- collate function.
-
- Instead of `list[int]` you can have tensors (numpy arrays, PyTorch tensors), see
- the note above for the return type.
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
- index) among:
-
- - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
- sequence if provided).
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
- acceptable input length for the model if that argument is not provided.
- - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
- lengths).
- max_length (`int`, *optional*):
- Maximum length of the returned list and optionally padding length (see above).
- pad_to_multiple_of (`int`, *optional*):
- If set will pad the sequence to a multiple of the provided value.
-
- This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
- `>= 7.5` (Volta).
- padding_side (`str`, *optional*):
- The side on which the model should have padding applied. Should be selected between ['right', 'left'].
- Default value is picked from the class attribute of the same name.
- return_attention_mask (`bool`, *optional*):
- Whether to return the attention mask. If left to the default, will return the attention mask according
- to the specific tokenizer's default, defined by the `return_outputs` attribute.
-
- [What are attention masks?](../glossary#attention-mask)
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- verbose (`bool`, *optional*, defaults to `True`):
- Whether or not to print more information and warnings.
- """
- # If we have a list of dicts, let's convert it in a dict of lists
- # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
- if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
- # Call .keys() explicitly for compatibility with TensorDict and other Mapping subclasses
- encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
-
- # The model's main input name, usually `input_ids`, has been passed for padding
- if self.model_input_names[0] not in encoded_inputs:
- raise ValueError(
- "You should supply an encoding or a list of encodings to this method "
- f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
- )
-
- required_input = encoded_inputs[self.model_input_names[0]]
-
- if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):
- if return_attention_mask:
- encoded_inputs["attention_mask"] = []
- return encoded_inputs
-
- # If we have PyTorch/NumPy tensors/arrays as inputs, we cast them as python objects
- # and rebuild them afterwards if no return_tensors is specified
- # Note that we lose the specific device the tensor may be on for PyTorch
-
- first_element = required_input[0]
- if isinstance(first_element, (list, tuple)):
- # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
- for item in required_input:
- if len(item) != 0:
- first_element = item[0]
- break
- # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
- if not isinstance(first_element, (int, list, tuple)):
- if is_torch_tensor(first_element):
- return_tensors = "pt" if return_tensors is None else return_tensors
- elif isinstance(first_element, np.ndarray):
- return_tensors = "np" if return_tensors is None else return_tensors
- else:
- raise ValueError(
- f"type of {first_element} unknown: {type(first_element)}. "
- "Should be one of a python, numpy, or pytorch object."
- )
-
- for key, value in encoded_inputs.items():
- encoded_inputs[key] = to_py_obj(value)
-
- # Convert padding_strategy in PaddingStrategy
- padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
- padding=padding, max_length=max_length, verbose=verbose
- )
-
- required_input = encoded_inputs[self.model_input_names[0]]
- if required_input and not isinstance(required_input[0], (list, tuple)):
- encoded_inputs = self._pad(
- encoded_inputs,
- max_length=max_length,
- padding_strategy=padding_strategy,
- pad_to_multiple_of=pad_to_multiple_of,
- padding_side=padding_side,
- return_attention_mask=return_attention_mask,
- )
- return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
-
- batch_size = len(required_input)
- assert all(len(v) == batch_size for v in encoded_inputs.values()), (
- "Some items in the output dictionary have a different batch size than others."
- )
-
- if padding_strategy == PaddingStrategy.LONGEST:
- max_length = max(len(inputs) for inputs in required_input)
- padding_strategy = PaddingStrategy.MAX_LENGTH
-
- batch_outputs = {}
- for i in range(batch_size):
- inputs = {k: v[i] for k, v in encoded_inputs.items()}
- outputs = self._pad(
- inputs,
- max_length=max_length,
- padding_strategy=padding_strategy,
- pad_to_multiple_of=pad_to_multiple_of,
- padding_side=padding_side,
- return_attention_mask=return_attention_mask,
- )
-
- for key, value in outputs.items():
- if key not in batch_outputs:
- batch_outputs[key] = []
- batch_outputs[key].append(value)
-
- return BatchEncoding(batch_outputs, tensor_type=return_tensors)
-
- def truncate_sequences(
+ def truncate_sequences( # type: ignore[override]
self,
ids: list[int],
pair_ids: None = None,
@@ -1407,47 +1009,36 @@ class MistralCommonBackend(PushToHubMixin):
|
|
|
1407
1009
|
`Tuple[list[int], None, list[int]]`: The truncated `ids` and the list of
|
|
1408
1010
|
overflowing tokens. `None` is returned to match Transformers signature.
|
|
1409
1011
|
"""
|
|
1410
|
-
|
|
1411
|
-
raise ValueError(
|
|
1412
|
-
f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.truncate_sequences`."
|
|
1413
|
-
)
|
|
1012
|
+
|
|
1414
1013
|
if pair_ids:
|
|
1415
1014
|
raise ValueError("`pair_ids` is not supported by `MistralCommonBackend.truncate_sequences`.")
|
|
1416
1015
|
|
|
1417
|
-
if num_tokens_to_remove <= 0:
|
|
1418
|
-
return (ids, None, [])
|
|
1419
|
-
|
|
1420
1016
|
if not isinstance(truncation_strategy, TruncationStrategy):
|
|
1421
1017
|
truncation_strategy = TruncationStrategy(truncation_strategy)
|
|
1422
1018
|
|
|
1423
|
-
if truncation_strategy in [
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1019
|
+
if truncation_strategy in [
|
|
1020
|
+
TruncationStrategy.ONLY_FIRST,
|
|
1021
|
+
TruncationStrategy.ONLY_SECOND,
|
|
1022
|
+
]:
|
|
1023
|
+
raise ValueError(f"{truncation_strategy=} is not supported by `MistralCommonBackend`.")
|
|
1024
|
+
|
|
1025
|
+
if num_tokens_to_remove <= 0:
|
|
1026
|
+
return ids, None, []
|
|
1427
1027
|
|
|
1428
1028
|
overflowing_tokens = []
|
|
1429
|
-
if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
|
|
1430
|
-
if len(ids) > num_tokens_to_remove:
|
|
1431
|
-
window_len = min(len(ids), stride + num_tokens_to_remove)
|
|
1432
|
-
if self.truncation_side == "left":
|
|
1433
|
-
overflowing_tokens = ids[:window_len]
|
|
1434
|
-
ids = ids[num_tokens_to_remove:]
|
|
1435
|
-
elif self.truncation_side == "right":
|
|
1436
|
-
overflowing_tokens = ids[-window_len:]
|
|
1437
|
-
ids = ids[:-num_tokens_to_remove]
|
|
1438
|
-
else:
|
|
1439
|
-
raise ValueError(f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'.")
|
|
1440
1029
|
|
|
1030
|
+
if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
|
|
1031
|
+
window_len = min(len(ids), stride + num_tokens_to_remove)
|
|
1032
|
+
if self.truncation_side == "left":
|
|
1033
|
+
overflowing_tokens = ids[:window_len]
|
|
1034
|
+
ids = ids[num_tokens_to_remove:]
|
|
1441
1035
|
else:
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
f"but the first sequence has a length {len(ids)}. "
|
|
1445
|
-
)
|
|
1446
|
-
logger.error(error_msg)
|
|
1036
|
+
overflowing_tokens = ids[-window_len:]
|
|
1037
|
+
ids = ids[:-num_tokens_to_remove]
|
|
1447
1038
|
|
|
1448
|
-
return
|
|
1039
|
+
return ids, None, overflowing_tokens
|
|
1449
1040
|
|
|
1450
|
-
def apply_chat_template(
|
|
1041
|
+
def apply_chat_template( # type: ignore[override]
|
|
1451
1042
|
self,
|
|
1452
1043
|
conversation: list[dict[str, str]] | list[list[dict[str, str]]],
|
|
1453
1044
|
tools: list[dict | Callable] | None = None,
|
|
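Illustrative sketch of the reworked `truncate_sequences` shown above. It assumes `MistralCommonBackend` is importable from the top-level `transformers` namespace, that `./mistral_tokenizer_dir` is a placeholder directory containing a valid tokenizer file, and that the default `truncation_side` is `"right"`; none of these details come from this diff.

from transformers import MistralCommonBackend  # import path is an assumption

tok = MistralCommonBackend.from_pretrained("./mistral_tokenizer_dir")  # placeholder path

ids = list(range(10))

# LONGEST_FIRST now truncates unconditionally and returns the overflow window.
truncated, _, overflow = tok.truncate_sequences(
    ids, num_tokens_to_remove=3, truncation_strategy="longest_first", stride=1
)
assert truncated == ids[:-3]  # with truncation_side="right"
assert overflow == ids[-4:]   # window = stride + num_tokens_to_remove

# ONLY_FIRST / ONLY_SECOND are rejected up front instead of being mishandled.
try:
    tok.truncate_sequences(ids, num_tokens_to_remove=1, truncation_strategy="only_first")
except ValueError:
    pass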
@@ -1475,8 +1066,8 @@ class MistralCommonBackend(PushToHubMixin):
                 [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
                 for more information.
             add_generation_prompt (`bool`, *optional*):
-                This argument is a no-op for `MistralCommonBackend`. However it cannot be used at the same time as `continue_final_message` to keep the API consistent
-
+                This argument is a no-op for `MistralCommonBackend`. However, it cannot be used at the same time as `continue_final_message` to keep the API consistent.
+                If any conversation ends with an assistant message, it will raise an error. In such cases, use `continue_final_message` instead.
             continue_final_message (bool, *optional*):
                 If this is set, the chat will be formatted so that the final
                 message in the chat is open-ended, without any EOS tokens. The model will continue this message
@@ -1511,8 +1102,7 @@ class MistralCommonBackend(PushToHubMixin):
                 Will raise an error if used.

         Returns:
-            `Union[str, list[int], list[str], list[list[int]], BatchEncoding]`:
-            tokens. This output is ready to pass to the model, either directly or via methods like `generate()`.
+            `Union[str, list[int], list[str], list[list[int]], BatchEncoding]`: The tokenized chat so far, including control tokens. This output is ready to pass to the model, either directly or via methods like `generate()`.
         """
         if kwargs:
             raise ValueError(
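A hedged usage sketch for `apply_chat_template` under the constraint documented above (conversations that end with an assistant turn must use `continue_final_message`); the import path and checkpoint directory are placeholders, not part of this diff.

from transformers import MistralCommonBackend  # import path is an assumption

tok = MistralCommonBackend.from_pretrained("./mistral_tokenizer_dir")  # placeholder path

messages = [
    {"role": "user", "content": "Write a haiku about tokenizers."},
    {"role": "assistant", "content": "Green lines arriving,"},
]

# The last message is an assistant turn, so it is continued rather than closed
# with an EOS token; per the docstring this returns ids ready for generate().
ids = tok.apply_chat_template(messages, continue_final_message=True)

# add_generation_prompt=True on the same conversation would raise, since it
# cannot be combined with continue_final_message or a trailing assistant turn.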
@@ -1659,6 +1249,83 @@ class MistralCommonBackend(PushToHubMixin):
             )
         return outputs

+    def build_inputs_with_special_tokens(self, token_ids_0: list[int], token_ids_1: None = None) -> list[int]:
+        """
+        Build model inputs from a sequence by adding special tokens.
+
+        This method dynamically builds inputs based on the tokenizer's `mode`:
+        - `"test"`: [BOS] seq0
+        - `"finetuning"`: [BOS] seq0 [EOS]
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`None`, *optional*): None, kept to match Transformers' signature.
+
+        Returns:
+            `list[int]`: List of input IDs with the appropriate special tokens.
+        """
+        if token_ids_1 is not None:
+            raise ValueError(
+                "`MistralCommonBackend` does not implement `token_ids_1 != None` for `build_inputs_with_special_tokens`."
+            )
+
+        if self.mode == ValidationMode.test:
+            # [BOS] seq0
+            return [self.bos_token_id] + token_ids_0
+
+        else:
+            # [BOS] seq0 [EOS]
+            return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
+
+    def create_token_type_ids_from_sequences(self, token_ids_0: list[int], token_ids_1: None = None) -> list[int]:
+        """
+        Create a mask of zeroes from the token ids with special tokens added.
+
+        Kept to match Transformers' implementation.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`None`, *optional*): None, kept to match Transformers' signature.
+
+
+        Returns:
+            `list[int]`: Token type IDs according to the configured pattern.
+        """
+        if token_ids_1 is not None:
+            raise ValueError(
+                "`MistralCommonBackend` does not implement `token_ids_1 != None` for `create_token_type_ids_from_sequences`."
+            )
+
+        sequence = self.build_inputs_with_special_tokens(token_ids_0)
+
+        return [0] * len(sequence)
+
+    def num_special_tokens_to_add(self, pair: Literal[False] = False) -> int:
+        """
+        Returns the number of added tokens when encoding a sequence with special tokens.
+
+        <Tip>
+
+        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
+        this inside your training loop.
+
+        </Tip>
+
+        Args:
+            pair (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
+
+        Returns:
+            `int`: Number of special tokens added to sequences.
+        """
+        if pair:
+            raise ValueError(
+                "`MistralCommonBackend` does not implement `pair = True` for `num_special_tokens_to_add`."
+            )
+
+        return len(self.build_inputs_with_special_tokens([], None))
+
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def __call__(
         self,
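A minimal sketch of the special-token helpers added in this hunk, assuming an instance loaded in the default `"test"` validation mode; the import path and directory are placeholders.

from transformers import MistralCommonBackend  # import path is an assumption

tok = MistralCommonBackend.from_pretrained("./mistral_tokenizer_dir")  # placeholder path

token_ids = [1001, 1002, 1003]  # made-up ids

with_special = tok.build_inputs_with_special_tokens(token_ids)
# In test mode this prepends BOS only; in finetuning mode EOS is appended too.

token_type_ids = tok.create_token_type_ids_from_sequences(token_ids)
assert token_type_ids == [0] * len(with_special)  # always a mask of zeroes

n_added = tok.num_special_tokens_to_add()
# 1 in test mode ([BOS]), 2 in finetuning mode ([BOS] ... [EOS]), per the code above.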
@@ -1679,6 +1346,8 @@ class MistralCommonBackend(PushToHubMixin):
         return_special_tokens_mask: bool = False,
         return_length: bool = False,
         verbose: bool = True,
+        return_offsets_mapping: Literal[False] = False,
+        split_special_tokens: Literal[False] = False,
         **kwargs,
     ) -> BatchEncoding:
         """
@@ -1696,92 +1365,49 @@ class MistralCommonBackend(PushToHubMixin):
             text_pair_target (`None`, *optional*):
                 Not supported by `MistralCommonBackend`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
         """
-        if
-            raise ValueError(
+        if return_offsets_mapping or split_special_tokens:
+            raise ValueError(
+                "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
+            )

-        if
+        if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND, "only_first", "only_second"]:
             raise ValueError(
-                "
+                "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonBackend`."
             )

-
-
-                # Strings are fine
-                return True
-            elif isinstance(t, (list, tuple)):
-                # List are fine as long as they are...
-                if len(t) == 0:
-                    # ... empty
-                    return True
-                elif isinstance(t[0], (str, int)):
-                    # ... list of strings or int
-                    return True
-                elif isinstance(t[0], (list, tuple)):
-                    # ... list with an empty list or with a list of strings or with a list of ints
-                    return len(t[0]) == 0 or isinstance(t[0][0], (str, int))
-                else:
-                    return False
-            else:
-                return False
+        if kwargs:
+            raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.__call__`.")

-        if
+        if text_pair or text_target or text_pair_target:
             raise ValueError(
-                "
-                "or `list[list[int]]` (batch of encoded examples)."
+                "`text_pair`, `text_target` and `text_pair_target` are not supported by `MistralCommonBackend`."
             )

-
-
-
+        return super().__call__(
+            text=text,
+            text_pair=text_pair,
+            text_target=text_target,
+            add_special_tokens=add_special_tokens,
             padding=padding,
             truncation=truncation,
             max_length=max_length,
+            stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_length=return_length,
             verbose=verbose,
-            **kwargs,
         )

-        if is_batched:
-            return self._batch_encode_plus(
-                batch_text=text,
-                add_special_tokens=add_special_tokens,
-                padding_strategy=padding_strategy,
-                truncation_strategy=truncation_strategy,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_tensors=return_tensors,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_length=return_length,
-                verbose=verbose,
-            )
-        else:
-            return self._encode_plus(
-                text=text,
-                add_special_tokens=add_special_tokens,
-                padding_strategy=padding_strategy,
-                truncation_strategy=truncation_strategy,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_tensors=return_tensors,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_length=return_length,
-                verbose=verbose,
-            )
-
     @classmethod
     def from_pretrained(
         cls,
         pretrained_model_name_or_path: str | os.PathLike,
         *init_inputs,
-        mode:
+        mode: str | ValidationMode = ValidationMode.test,
         cache_dir: str | os.PathLike | None = None,
         force_download: bool = False,
         local_files_only: bool = False,
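Sketch of the reworked `__call__`, which now validates its arguments and defers to the shared `PreTrainedTokenizerBase.__call__` path instead of dispatching to `_encode_plus`/`_batch_encode_plus` directly; the import location and directory are placeholders.

from transformers import MistralCommonBackend  # import path is an assumption

tok = MistralCommonBackend.from_pretrained("./mistral_tokenizer_dir")  # placeholder path

batch = tok(["Hello world", "Bonjour le monde"], padding=True, return_tensors="pt")
# batch is a BatchEncoding with input_ids and attention_mask tensors.

# Options the backend cannot honour now fail fast instead of being ignored:
try:
    tok("Hello world", return_offsets_mapping=True)
except ValueError:
    pass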
@@ -1808,9 +1434,9 @@ class MistralCommonBackend(PushToHubMixin):
                 `./my_model_directory/`.
             mode (`Union[str, ValidationMode]`, *optional*, defaults to `ValidationMode.test`):
                 Validation mode for the `MistralTokenizer` tokenizer. Possible values are:
-                - `"finetuning"` or `ValidationMode.finetuning`: The
+                - `"finetuning"` or `ValidationMode.finetuning`: The fine-tuning mode.
                 - `"test"` or `ValidationMode.test`: The test mode.
-                It changes how the tokenizer validates the input and
+                It changes how the tokenizer validates the input and prepares the request to the model.
             cache_dir (`str` or `os.PathLike`, *optional*):
                 Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the
                 standard cache should not be used.
@@ -1837,11 +1463,11 @@ class MistralCommonBackend(PushToHubMixin):
                 Default value is picked from the class attribute of the same name.
             truncation_side (`str`, *optional*, defaults to `"right"`):
                 The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
-            model_input_names (`List[
+            model_input_names (`List[str]`, *optional*):
                 The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
                 `"attention_mask"`). Default value is picked from the class attribute of the same name.
             clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
-                Whether or not the model should
+                Whether or not the model should clean up the spaces that were added when splitting the input text during the
                 tokenization process.
             kwargs (additional keyword arguments, *optional*):
                 Not supported by `MistralCommonBackend.from_pretrained`.
@@ -1851,11 +1477,13 @@ class MistralCommonBackend(PushToHubMixin):
             raise ValueError("`init_inputs` are not supported by `MistralCommonBackend.from_pretrained`.")

         # Handle kwargs and AutoTokenizer/AutoProcessor case
-
-
-
-        ):
-            raise ValueError(
+        valid_kwargs = _VALID_INIT_KWARGS.union(
+            {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "subfolder"}
+        )
+        if kwargs and not set(kwargs.keys()).issubset(valid_kwargs):
+            raise ValueError(
+                f"Some kwargs in {list(kwargs.keys())} are not supported by `MistralCommonBackend.from_pretrained`."
+            )

         mode = cls._get_validation_mode(mode)

@@ -1869,35 +1497,8 @@ class MistralCommonBackend(PushToHubMixin):
                 local_files_only=local_files_only,
             )
         else:
-
-
-
-            instruct_versions = list(TokenizerVersion.__members__)
-            mm_versions = list(MultiModalVersion.__members__) + [""]  # allow no mm version
-            sentencepiece_suffixes = [f".model.{v}{m}" for v in instruct_versions for m in mm_versions] + [".model"]
-
-            for path in os.listdir(pretrained_model_name_or_path):
-                pathlib_repo_file = Path(path)
-                file_name = pathlib_repo_file.name
-                suffix = "".join(pathlib_repo_file.suffixes)
-                if file_name == "tekken.json" or suffix in sentencepiece_suffixes:
-                    valid_tokenizer_files.append(file_name)
-
-            if len(valid_tokenizer_files) == 0:
-                raise ValueError(f"No tokenizer file found in directory: {pretrained_model_name_or_path}")
-            # If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
-            if len(valid_tokenizer_files) > 1:
-                if "tekken.json" in valid_tokenizer_files:
-                    tokenizer_file = "tekken.json"
-                else:
-                    tokenizer_file = max(valid_tokenizer_files)
-                logger.warning(
-                    f"Multiple tokenizer files found in directory: {pretrained_model_name_or_path}. Using {tokenizer_file}."
-                )
-            else:
-                tokenizer_file = valid_tokenizer_files[0]
-
-            tokenizer_path = os.path.join(pretrained_model_name_or_path, tokenizer_file)
+            candidate_files = os.listdir(pretrained_model_name_or_path)
+            tokenizer_path = os.path.join(pretrained_model_name_or_path, get_one_valid_tokenizer_file(candidate_files))

         return cls(
             tokenizer_path=tokenizer_path,
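Loading sketch for the revised `from_pretrained`: local directories now resolve the tokenizer file through `get_one_valid_tokenizer_file`, and unexpected keyword arguments are rejected early. The directory name is a placeholder and the import path is an assumption.

from transformers import MistralCommonBackend  # import path is an assumption

# "finetuning" switches the underlying mistral-common validation mode.
tok = MistralCommonBackend.from_pretrained("./mistral_tokenizer_dir", mode="finetuning")

# Keyword arguments outside the validated set raise instead of being ignored:
try:
    MistralCommonBackend.from_pretrained("./mistral_tokenizer_dir", not_a_real_option=True)
except ValueError:
    pass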
@@ -1909,7 +1510,7 @@ class MistralCommonBackend(PushToHubMixin):
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
         )

-    def save_pretrained(
+    def save_pretrained(  # type: ignore[override]
         self,
         save_directory: str | os.PathLike | Path,
         push_to_hub: bool = False,
@@ -1971,7 +1572,7 @@ class MistralCommonBackend(PushToHubMixin):
         return (str(save_directory / self._tokenizer_path.name),)

     @staticmethod
-    def _get_validation_mode(mode:
+    def _get_validation_mode(mode: str | ValidationMode) -> ValidationMode:
         """Get the validation mode from a string or a ValidationMode."""
         _invalid_mode_msg = (
             f"Invalid `mistral-common` tokenizer mode: {mode}. Possible values are 'finetuning' or 'test'."
@@ -1988,6 +1589,65 @@ class MistralCommonBackend(PushToHubMixin):
             raise ValueError(_invalid_mode_msg)
         return mode

+    def add_special_tokens(
+        self,
+        special_tokens_dict: dict[str, str | AddedToken | Sequence[str | AddedToken]],
+        replace_extra_special_tokens: bool = True,
+    ):
+        r"""`MistralCommonBackend` does not implement `add_special_tokens` by design.
+
+        If you would like this behaviour to be implemented, please open an issue in the `Transformers` or `mistral-common` repositories to request it.
+        """
+
+        raise NotImplementedError("`MistralCommonBackend` does not implement `add_special_tokens`.")
+
+    def add_tokens(  # type: ignore[override]
+        self,
+        special_tokens_dict: dict[str, str | AddedToken | Sequence[str | AddedToken]],
+        replace_extra_special_tokens: bool = True,
+    ):
+        """
+        `MistralCommonBackend` does not implement `add_tokens` by design.
+
+        If you would like this behaviour to be implemented, please open an issue in the `Transformers` or `mistral-common` repositories to request it.
+        """
+
+        raise NotImplementedError("`MistralCommonBackend` does not implement `add_tokens`.")
+
+    def convert_added_tokens(cls, obj: AddedToken | Any, save: bool = False, add_type_field: bool = True):  # type: ignore[override]
+        """
+        `MistralCommonBackend` does not implement `convert_added_tokens` by design.
+
+        If you would like this behaviour to be implemented, please open an issue in the `Transformers` or `mistral-common` repositories to request it.
+        """
+
+        raise NotImplementedError("`MistralCommonBackend` does not implement `convert_added_tokens`.")
+
+    def get_chat_template(self, chat_template: str | None = None, tools: list[dict] | None = None) -> str:
+        """`MistralCommonBackend` does not implement `get_chat_template` by design as `mistral-common` does not use chat templates."""
+
+        raise NotImplementedError("`MistralCommonBackend` does not implement `get_chat_template`.")
+
+    def save_chat_templates(
+        self,
+        save_directory: str | os.PathLike,
+        tokenizer_config: dict,
+        filename_prefix: str | None,
+        save_jinja_files: bool,
+    ):
+        """`MistralCommonBackend` does not implement `save_chat_templates` by design as `mistral-common` does not use chat templates."""
+
+        raise NotImplementedError("`MistralCommonBackend` does not implement `save_chat_templates`.")
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str, ...]:
+        """
+        `MistralCommonBackend` does not implement `save_vocabulary` by design.
+
+        This is because `mistral-common` is configured by one tokenizer file. If you'd like to save the vocabulary, please consider using the `save_pretrained` method instead.
+        """
+
+        raise NotImplementedError("`MistralCommonBackend` does not implement `save_vocabulary`.")
+

 # Backward compatibility alias for codebases still importing the legacy name.
 MistralCommonTokenizer = MistralCommonBackend
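A short sketch of the intentionally unsupported APIs and the legacy alias; as elsewhere, the import path and directory are assumptions.

from transformers import MistralCommonTokenizer  # legacy alias of MistralCommonBackend

tok = MistralCommonTokenizer.from_pretrained("./mistral_tokenizer_dir")  # placeholder path

# Vocabulary mutation and chat-template persistence are unsupported by design:
try:
    tok.add_tokens({"additional_special_tokens": ["<custom>"]})
except NotImplementedError:
    pass

try:
    tok.save_vocabulary("./out")
except NotImplementedError:
    pass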