transformers 5.0.0-py3-none-any.whl → 5.0.0rc0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
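For reference, per-file line counts like those in the listing below can be approximated locally by diffing the two wheels directly, since a wheel is a plain zip archive. The following is a minimal sketch, not the tool that produced this page: it assumes both wheels have already been fetched (for example with `pip download transformers==5.0.0 --no-deps` and `pip download transformers==5.0.0rc0 --no-deps`), and it only compares `.py` members present in both archives, so its counts may differ slightly from the listing.

```python
import difflib
import zipfile

# Assumed local filenames of the two downloaded wheels.
OLD = "transformers-5.0.0-py3-none-any.whl"
NEW = "transformers-5.0.0rc0-py3-none-any.whl"

def read_members(path: str) -> dict[str, list[str]]:
    """Map each .py member of a wheel (a zip archive) to its text lines."""
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode("utf-8", errors="replace").splitlines()
            for name in zf.namelist()
            if name.endswith(".py")
        }

old, new = read_members(OLD), read_members(NEW)

# Files only present in one wheel (pure additions/removals) are skipped here.
for name in sorted(old.keys() & new.keys()):
    diff = list(difflib.unified_diff(old[name], new[name], lineterm=""))
    added = sum(1 for l in diff if l.startswith("+") and not l.startswith("+++"))
    removed = sum(1 for l in diff if l.startswith("-") and not l.startswith("---"))
    if added or removed:
        print(f"{name} +{added} -{removed}")
```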
Files changed (1606)
  1. transformers/__init__.py +36 -55
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +33 -32
  4. transformers/cache_utils.py +139 -32
  5. transformers/cli/chat.py +3 -3
  6. transformers/cli/serve.py +19 -49
  7. transformers/cli/transformers.py +1 -2
  8. transformers/configuration_utils.py +155 -129
  9. transformers/conversion_mapping.py +22 -158
  10. transformers/convert_slow_tokenizer.py +17 -227
  11. transformers/core_model_loading.py +185 -528
  12. transformers/data/data_collator.py +4 -12
  13. transformers/data/processors/glue.py +1 -0
  14. transformers/data/processors/utils.py +1 -0
  15. transformers/data/processors/xnli.py +1 -0
  16. transformers/dependency_versions_check.py +1 -0
  17. transformers/dependency_versions_table.py +7 -5
  18. transformers/distributed/configuration_utils.py +2 -1
  19. transformers/dynamic_module_utils.py +25 -24
  20. transformers/feature_extraction_sequence_utils.py +23 -19
  21. transformers/feature_extraction_utils.py +33 -64
  22. transformers/file_utils.py +1 -0
  23. transformers/generation/__init__.py +1 -11
  24. transformers/generation/candidate_generator.py +33 -80
  25. transformers/generation/configuration_utils.py +133 -189
  26. transformers/generation/continuous_batching/__init__.py +1 -4
  27. transformers/generation/continuous_batching/cache.py +25 -83
  28. transformers/generation/continuous_batching/cache_manager.py +45 -155
  29. transformers/generation/continuous_batching/continuous_api.py +147 -270
  30. transformers/generation/continuous_batching/requests.py +3 -51
  31. transformers/generation/continuous_batching/scheduler.py +105 -160
  32. transformers/generation/logits_process.py +128 -0
  33. transformers/generation/stopping_criteria.py +1 -1
  34. transformers/generation/streamers.py +1 -0
  35. transformers/generation/utils.py +123 -122
  36. transformers/generation/watermarking.py +6 -8
  37. transformers/hf_argparser.py +13 -9
  38. transformers/hyperparameter_search.py +2 -1
  39. transformers/image_processing_base.py +23 -12
  40. transformers/image_processing_utils.py +15 -11
  41. transformers/image_processing_utils_fast.py +75 -85
  42. transformers/image_transforms.py +42 -73
  43. transformers/image_utils.py +32 -30
  44. transformers/initialization.py +0 -37
  45. transformers/integrations/__init__.py +2 -16
  46. transformers/integrations/accelerate.py +113 -58
  47. transformers/integrations/aqlm.py +66 -36
  48. transformers/integrations/awq.py +516 -45
  49. transformers/integrations/bitnet.py +105 -47
  50. transformers/integrations/bitsandbytes.py +202 -91
  51. transformers/integrations/deepspeed.py +4 -161
  52. transformers/integrations/eetq.py +82 -84
  53. transformers/integrations/executorch.py +1 -1
  54. transformers/integrations/fbgemm_fp8.py +145 -190
  55. transformers/integrations/finegrained_fp8.py +215 -249
  56. transformers/integrations/flash_attention.py +3 -3
  57. transformers/integrations/flex_attention.py +1 -1
  58. transformers/integrations/fp_quant.py +0 -90
  59. transformers/integrations/ggml.py +2 -11
  60. transformers/integrations/higgs.py +62 -37
  61. transformers/integrations/hub_kernels.py +8 -65
  62. transformers/integrations/integration_utils.py +3 -47
  63. transformers/integrations/mistral.py +0 -12
  64. transformers/integrations/mxfp4.py +80 -33
  65. transformers/integrations/peft.py +191 -483
  66. transformers/integrations/quanto.py +56 -77
  67. transformers/integrations/spqr.py +90 -42
  68. transformers/integrations/tensor_parallel.py +221 -167
  69. transformers/integrations/torchao.py +43 -35
  70. transformers/integrations/vptq.py +59 -40
  71. transformers/kernels/__init__.py +0 -0
  72. transformers/{models/pe_audio_video/processing_pe_audio_video.py → kernels/falcon_mamba/__init__.py} +3 -12
  73. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +529 -0
  74. transformers/loss/loss_utils.py +0 -2
  75. transformers/masking_utils.py +55 -51
  76. transformers/model_debugging_utils.py +5 -4
  77. transformers/modelcard.py +194 -15
  78. transformers/modeling_attn_mask_utils.py +19 -19
  79. transformers/modeling_flash_attention_utils.py +27 -27
  80. transformers/modeling_gguf_pytorch_utils.py +24 -79
  81. transformers/modeling_layers.py +22 -21
  82. transformers/modeling_outputs.py +253 -242
  83. transformers/modeling_rope_utils.py +117 -138
  84. transformers/modeling_utils.py +739 -850
  85. transformers/models/__init__.py +0 -27
  86. transformers/models/afmoe/configuration_afmoe.py +33 -40
  87. transformers/models/afmoe/modeling_afmoe.py +54 -42
  88. transformers/models/afmoe/modular_afmoe.py +33 -23
  89. transformers/models/aimv2/configuration_aimv2.py +10 -2
  90. transformers/models/aimv2/modeling_aimv2.py +42 -47
  91. transformers/models/aimv2/modular_aimv2.py +19 -17
  92. transformers/models/albert/configuration_albert.py +2 -8
  93. transformers/models/albert/modeling_albert.py +69 -70
  94. transformers/models/albert/tokenization_albert.py +14 -5
  95. transformers/models/align/configuration_align.py +6 -8
  96. transformers/models/align/modeling_align.py +89 -94
  97. transformers/models/align/processing_align.py +30 -2
  98. transformers/models/altclip/configuration_altclip.py +7 -4
  99. transformers/models/altclip/modeling_altclip.py +103 -114
  100. transformers/models/altclip/processing_altclip.py +15 -2
  101. transformers/models/apertus/__init__.py +1 -0
  102. transformers/models/apertus/configuration_apertus.py +28 -23
  103. transformers/models/apertus/modeling_apertus.py +40 -39
  104. transformers/models/apertus/modular_apertus.py +38 -37
  105. transformers/models/arcee/configuration_arcee.py +30 -25
  106. transformers/models/arcee/modeling_arcee.py +39 -36
  107. transformers/models/arcee/modular_arcee.py +23 -20
  108. transformers/models/aria/configuration_aria.py +44 -31
  109. transformers/models/aria/image_processing_aria.py +27 -25
  110. transformers/models/aria/modeling_aria.py +106 -110
  111. transformers/models/aria/modular_aria.py +127 -118
  112. transformers/models/aria/processing_aria.py +35 -28
  113. transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +1 -0
  114. transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +6 -3
  115. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +8 -6
  116. transformers/models/audioflamingo3/__init__.py +1 -0
  117. transformers/models/audioflamingo3/configuration_audioflamingo3.py +1 -0
  118. transformers/models/audioflamingo3/modeling_audioflamingo3.py +49 -58
  119. transformers/models/audioflamingo3/modular_audioflamingo3.py +43 -53
  120. transformers/models/audioflamingo3/processing_audioflamingo3.py +30 -33
  121. transformers/models/auto/auto_factory.py +7 -6
  122. transformers/models/auto/configuration_auto.py +5 -66
  123. transformers/models/auto/feature_extraction_auto.py +10 -14
  124. transformers/models/auto/image_processing_auto.py +41 -32
  125. transformers/models/auto/modeling_auto.py +188 -46
  126. transformers/models/auto/processing_auto.py +11 -24
  127. transformers/models/auto/tokenization_auto.py +588 -171
  128. transformers/models/auto/video_processing_auto.py +10 -12
  129. transformers/models/autoformer/configuration_autoformer.py +7 -4
  130. transformers/models/autoformer/modeling_autoformer.py +101 -104
  131. transformers/models/aya_vision/configuration_aya_vision.py +1 -4
  132. transformers/models/aya_vision/modeling_aya_vision.py +102 -71
  133. transformers/models/aya_vision/modular_aya_vision.py +74 -46
  134. transformers/models/aya_vision/processing_aya_vision.py +53 -25
  135. transformers/models/bamba/configuration_bamba.py +39 -34
  136. transformers/models/bamba/modeling_bamba.py +86 -82
  137. transformers/models/bamba/modular_bamba.py +72 -70
  138. transformers/models/bark/configuration_bark.py +8 -6
  139. transformers/models/bark/generation_configuration_bark.py +5 -3
  140. transformers/models/bark/modeling_bark.py +57 -54
  141. transformers/models/bark/processing_bark.py +41 -19
  142. transformers/models/bart/configuration_bart.py +6 -9
  143. transformers/models/bart/modeling_bart.py +126 -135
  144. transformers/models/barthez/tokenization_barthez.py +11 -3
  145. transformers/models/bartpho/tokenization_bartpho.py +7 -6
  146. transformers/models/beit/configuration_beit.py +11 -0
  147. transformers/models/beit/image_processing_beit.py +56 -53
  148. transformers/models/beit/image_processing_beit_fast.py +12 -10
  149. transformers/models/beit/modeling_beit.py +60 -69
  150. transformers/models/bert/configuration_bert.py +2 -12
  151. transformers/models/bert/modeling_bert.py +122 -114
  152. transformers/models/bert/tokenization_bert.py +23 -8
  153. transformers/models/bert/tokenization_bert_legacy.py +5 -3
  154. transformers/models/bert_generation/configuration_bert_generation.py +2 -17
  155. transformers/models/bert_generation/modeling_bert_generation.py +49 -49
  156. transformers/models/bert_generation/tokenization_bert_generation.py +3 -2
  157. transformers/models/bert_japanese/tokenization_bert_japanese.py +6 -5
  158. transformers/models/bertweet/tokenization_bertweet.py +3 -1
  159. transformers/models/big_bird/configuration_big_bird.py +9 -12
  160. transformers/models/big_bird/modeling_big_bird.py +109 -116
  161. transformers/models/big_bird/tokenization_big_bird.py +43 -16
  162. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -9
  163. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +117 -130
  164. transformers/models/biogpt/configuration_biogpt.py +2 -8
  165. transformers/models/biogpt/modeling_biogpt.py +76 -72
  166. transformers/models/biogpt/modular_biogpt.py +66 -62
  167. transformers/models/biogpt/tokenization_biogpt.py +5 -3
  168. transformers/models/bit/configuration_bit.py +1 -0
  169. transformers/models/bit/image_processing_bit.py +24 -21
  170. transformers/models/bit/image_processing_bit_fast.py +1 -0
  171. transformers/models/bit/modeling_bit.py +12 -25
  172. transformers/models/bitnet/configuration_bitnet.py +28 -23
  173. transformers/models/bitnet/modeling_bitnet.py +39 -36
  174. transformers/models/bitnet/modular_bitnet.py +6 -4
  175. transformers/models/blenderbot/configuration_blenderbot.py +5 -8
  176. transformers/models/blenderbot/modeling_blenderbot.py +96 -77
  177. transformers/models/blenderbot/tokenization_blenderbot.py +24 -18
  178. transformers/models/blenderbot_small/configuration_blenderbot_small.py +5 -8
  179. transformers/models/blenderbot_small/modeling_blenderbot_small.py +69 -79
  180. transformers/models/blenderbot_small/tokenization_blenderbot_small.py +3 -1
  181. transformers/models/blip/configuration_blip.py +10 -9
  182. transformers/models/blip/image_processing_blip.py +20 -17
  183. transformers/models/blip/image_processing_blip_fast.py +1 -0
  184. transformers/models/blip/modeling_blip.py +108 -117
  185. transformers/models/blip/modeling_blip_text.py +65 -73
  186. transformers/models/blip/processing_blip.py +36 -5
  187. transformers/models/blip_2/configuration_blip_2.py +2 -2
  188. transformers/models/blip_2/modeling_blip_2.py +118 -146
  189. transformers/models/blip_2/processing_blip_2.py +38 -8
  190. transformers/models/bloom/configuration_bloom.py +2 -5
  191. transformers/models/bloom/modeling_bloom.py +104 -77
  192. transformers/models/blt/configuration_blt.py +86 -94
  193. transformers/models/blt/modeling_blt.py +81 -238
  194. transformers/models/blt/modular_blt.py +65 -228
  195. transformers/models/bridgetower/configuration_bridgetower.py +2 -7
  196. transformers/models/bridgetower/image_processing_bridgetower.py +35 -34
  197. transformers/models/bridgetower/image_processing_bridgetower_fast.py +16 -13
  198. transformers/models/bridgetower/modeling_bridgetower.py +119 -141
  199. transformers/models/bridgetower/processing_bridgetower.py +16 -2
  200. transformers/models/bros/configuration_bros.py +18 -24
  201. transformers/models/bros/modeling_bros.py +80 -90
  202. transformers/models/bros/processing_bros.py +12 -2
  203. transformers/models/byt5/tokenization_byt5.py +6 -4
  204. transformers/models/camembert/configuration_camembert.py +2 -8
  205. transformers/models/camembert/modeling_camembert.py +195 -196
  206. transformers/models/camembert/modular_camembert.py +54 -51
  207. transformers/models/camembert/tokenization_camembert.py +13 -6
  208. transformers/models/canine/configuration_canine.py +2 -4
  209. transformers/models/canine/modeling_canine.py +75 -84
  210. transformers/models/canine/tokenization_canine.py +1 -2
  211. transformers/models/chameleon/configuration_chameleon.py +34 -29
  212. transformers/models/chameleon/image_processing_chameleon.py +24 -21
  213. transformers/models/chameleon/image_processing_chameleon_fast.py +6 -5
  214. transformers/models/chameleon/modeling_chameleon.py +93 -142
  215. transformers/models/chameleon/processing_chameleon.py +41 -16
  216. transformers/models/chinese_clip/configuration_chinese_clip.py +8 -10
  217. transformers/models/chinese_clip/image_processing_chinese_clip.py +24 -21
  218. transformers/models/chinese_clip/image_processing_chinese_clip_fast.py +1 -0
  219. transformers/models/chinese_clip/modeling_chinese_clip.py +92 -96
  220. transformers/models/chinese_clip/processing_chinese_clip.py +15 -2
  221. transformers/models/clap/configuration_clap.py +9 -4
  222. transformers/models/clap/feature_extraction_clap.py +12 -11
  223. transformers/models/clap/modeling_clap.py +123 -136
  224. transformers/models/clap/processing_clap.py +15 -2
  225. transformers/models/clip/configuration_clip.py +2 -4
  226. transformers/models/clip/image_processing_clip.py +24 -21
  227. transformers/models/clip/image_processing_clip_fast.py +1 -9
  228. transformers/models/clip/modeling_clip.py +65 -65
  229. transformers/models/clip/processing_clip.py +14 -2
  230. transformers/models/clip/tokenization_clip.py +46 -21
  231. transformers/models/clipseg/configuration_clipseg.py +2 -4
  232. transformers/models/clipseg/modeling_clipseg.py +109 -119
  233. transformers/models/clipseg/processing_clipseg.py +42 -19
  234. transformers/models/clvp/configuration_clvp.py +5 -15
  235. transformers/models/clvp/feature_extraction_clvp.py +10 -7
  236. transformers/models/clvp/modeling_clvp.py +146 -155
  237. transformers/models/clvp/number_normalizer.py +2 -1
  238. transformers/models/clvp/processing_clvp.py +20 -3
  239. transformers/models/clvp/tokenization_clvp.py +64 -1
  240. transformers/models/code_llama/tokenization_code_llama.py +44 -18
  241. transformers/models/codegen/configuration_codegen.py +4 -4
  242. transformers/models/codegen/modeling_codegen.py +53 -63
  243. transformers/models/codegen/tokenization_codegen.py +47 -17
  244. transformers/models/cohere/configuration_cohere.py +30 -25
  245. transformers/models/cohere/modeling_cohere.py +42 -40
  246. transformers/models/cohere/modular_cohere.py +29 -26
  247. transformers/models/cohere/tokenization_cohere.py +46 -15
  248. transformers/models/cohere2/configuration_cohere2.py +32 -31
  249. transformers/models/cohere2/modeling_cohere2.py +44 -42
  250. transformers/models/cohere2/modular_cohere2.py +54 -54
  251. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +14 -13
  252. transformers/models/cohere2_vision/modeling_cohere2_vision.py +58 -59
  253. transformers/models/cohere2_vision/modular_cohere2_vision.py +46 -45
  254. transformers/models/cohere2_vision/processing_cohere2_vision.py +36 -6
  255. transformers/models/colpali/configuration_colpali.py +1 -0
  256. transformers/models/colpali/modeling_colpali.py +16 -14
  257. transformers/models/colpali/modular_colpali.py +51 -11
  258. transformers/models/colpali/processing_colpali.py +52 -14
  259. transformers/models/colqwen2/modeling_colqwen2.py +28 -28
  260. transformers/models/colqwen2/modular_colqwen2.py +74 -37
  261. transformers/models/colqwen2/processing_colqwen2.py +52 -16
  262. transformers/models/conditional_detr/configuration_conditional_detr.py +2 -1
  263. transformers/models/conditional_detr/image_processing_conditional_detr.py +70 -67
  264. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +36 -36
  265. transformers/models/conditional_detr/modeling_conditional_detr.py +87 -99
  266. transformers/models/conditional_detr/modular_conditional_detr.py +3 -49
  267. transformers/models/convbert/configuration_convbert.py +8 -11
  268. transformers/models/convbert/modeling_convbert.py +87 -94
  269. transformers/models/convbert/tokenization_convbert.py +1 -0
  270. transformers/models/convnext/configuration_convnext.py +1 -0
  271. transformers/models/convnext/image_processing_convnext.py +23 -20
  272. transformers/models/convnext/image_processing_convnext_fast.py +21 -16
  273. transformers/models/convnext/modeling_convnext.py +12 -9
  274. transformers/models/convnextv2/configuration_convnextv2.py +1 -0
  275. transformers/models/convnextv2/modeling_convnextv2.py +12 -9
  276. transformers/models/cpm/tokenization_cpm.py +7 -6
  277. transformers/models/cpm/tokenization_cpm_fast.py +5 -3
  278. transformers/models/cpmant/configuration_cpmant.py +1 -4
  279. transformers/models/cpmant/modeling_cpmant.py +40 -38
  280. transformers/models/cpmant/tokenization_cpmant.py +3 -1
  281. transformers/models/csm/configuration_csm.py +66 -58
  282. transformers/models/csm/generation_csm.py +35 -31
  283. transformers/models/csm/modeling_csm.py +85 -85
  284. transformers/models/csm/modular_csm.py +58 -58
  285. transformers/models/csm/processing_csm.py +68 -25
  286. transformers/models/ctrl/configuration_ctrl.py +1 -16
  287. transformers/models/ctrl/modeling_ctrl.py +44 -54
  288. transformers/models/ctrl/tokenization_ctrl.py +1 -0
  289. transformers/models/cvt/configuration_cvt.py +1 -0
  290. transformers/models/cvt/modeling_cvt.py +16 -20
  291. transformers/models/cwm/__init__.py +1 -0
  292. transformers/models/cwm/configuration_cwm.py +12 -8
  293. transformers/models/cwm/modeling_cwm.py +39 -37
  294. transformers/models/cwm/modular_cwm.py +12 -10
  295. transformers/models/d_fine/configuration_d_fine.py +5 -7
  296. transformers/models/d_fine/modeling_d_fine.py +128 -138
  297. transformers/models/d_fine/modular_d_fine.py +18 -33
  298. transformers/models/dab_detr/configuration_dab_detr.py +3 -6
  299. transformers/models/dab_detr/modeling_dab_detr.py +75 -81
  300. transformers/models/dac/configuration_dac.py +1 -0
  301. transformers/models/dac/feature_extraction_dac.py +9 -6
  302. transformers/models/dac/modeling_dac.py +26 -24
  303. transformers/models/data2vec/configuration_data2vec_audio.py +2 -4
  304. transformers/models/data2vec/configuration_data2vec_text.py +3 -11
  305. transformers/models/data2vec/configuration_data2vec_vision.py +1 -0
  306. transformers/models/data2vec/modeling_data2vec_audio.py +56 -57
  307. transformers/models/data2vec/modeling_data2vec_text.py +93 -98
  308. transformers/models/data2vec/modeling_data2vec_vision.py +45 -49
  309. transformers/models/data2vec/modular_data2vec_audio.py +1 -6
  310. transformers/models/data2vec/modular_data2vec_text.py +54 -58
  311. transformers/models/dbrx/configuration_dbrx.py +22 -36
  312. transformers/models/dbrx/modeling_dbrx.py +45 -42
  313. transformers/models/dbrx/modular_dbrx.py +33 -31
  314. transformers/models/deberta/configuration_deberta.py +1 -6
  315. transformers/models/deberta/modeling_deberta.py +60 -64
  316. transformers/models/deberta/tokenization_deberta.py +21 -9
  317. transformers/models/deberta_v2/configuration_deberta_v2.py +1 -6
  318. transformers/models/deberta_v2/modeling_deberta_v2.py +65 -71
  319. transformers/models/deberta_v2/tokenization_deberta_v2.py +29 -11
  320. transformers/models/decision_transformer/configuration_decision_transformer.py +2 -3
  321. transformers/models/decision_transformer/modeling_decision_transformer.py +56 -60
  322. transformers/models/deepseek_v2/configuration_deepseek_v2.py +44 -39
  323. transformers/models/deepseek_v2/modeling_deepseek_v2.py +43 -43
  324. transformers/models/deepseek_v2/modular_deepseek_v2.py +49 -48
  325. transformers/models/deepseek_v3/configuration_deepseek_v3.py +45 -40
  326. transformers/models/deepseek_v3/modeling_deepseek_v3.py +42 -45
  327. transformers/models/deepseek_v3/modular_deepseek_v3.py +9 -14
  328. transformers/models/deepseek_vl/configuration_deepseek_vl.py +3 -2
  329. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +26 -25
  330. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +10 -10
  331. transformers/models/deepseek_vl/modeling_deepseek_vl.py +48 -57
  332. transformers/models/deepseek_vl/modular_deepseek_vl.py +43 -14
  333. transformers/models/deepseek_vl/processing_deepseek_vl.py +41 -10
  334. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +5 -3
  335. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +35 -35
  336. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +24 -20
  337. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +61 -109
  338. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +118 -146
  339. transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +44 -12
  340. transformers/models/deformable_detr/configuration_deformable_detr.py +3 -2
  341. transformers/models/deformable_detr/image_processing_deformable_detr.py +61 -59
  342. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +28 -28
  343. transformers/models/deformable_detr/modeling_deformable_detr.py +82 -88
  344. transformers/models/deformable_detr/modular_deformable_detr.py +3 -1
  345. transformers/models/deit/configuration_deit.py +1 -0
  346. transformers/models/deit/image_processing_deit.py +21 -18
  347. transformers/models/deit/image_processing_deit_fast.py +1 -0
  348. transformers/models/deit/modeling_deit.py +22 -24
  349. transformers/models/depth_anything/configuration_depth_anything.py +4 -2
  350. transformers/models/depth_anything/modeling_depth_anything.py +10 -10
  351. transformers/models/depth_pro/configuration_depth_pro.py +1 -0
  352. transformers/models/depth_pro/image_processing_depth_pro.py +23 -22
  353. transformers/models/depth_pro/image_processing_depth_pro_fast.py +10 -8
  354. transformers/models/depth_pro/modeling_depth_pro.py +27 -31
  355. transformers/models/detr/configuration_detr.py +2 -1
  356. transformers/models/detr/image_processing_detr.py +66 -64
  357. transformers/models/detr/image_processing_detr_fast.py +34 -33
  358. transformers/models/detr/modeling_detr.py +79 -95
  359. transformers/models/dia/configuration_dia.py +15 -9
  360. transformers/models/dia/feature_extraction_dia.py +9 -6
  361. transformers/models/dia/generation_dia.py +50 -48
  362. transformers/models/dia/modeling_dia.py +69 -78
  363. transformers/models/dia/modular_dia.py +56 -64
  364. transformers/models/dia/processing_dia.py +29 -39
  365. transformers/models/dia/tokenization_dia.py +6 -3
  366. transformers/models/diffllama/configuration_diffllama.py +30 -25
  367. transformers/models/diffllama/modeling_diffllama.py +49 -46
  368. transformers/models/diffllama/modular_diffllama.py +19 -17
  369. transformers/models/dinat/configuration_dinat.py +1 -0
  370. transformers/models/dinat/modeling_dinat.py +44 -47
  371. transformers/models/dinov2/configuration_dinov2.py +1 -0
  372. transformers/models/dinov2/modeling_dinov2.py +15 -15
  373. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +1 -1
  374. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +15 -16
  375. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +9 -9
  376. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +7 -4
  377. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +6 -3
  378. transformers/models/dinov3_vit/configuration_dinov3_vit.py +8 -5
  379. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +9 -7
  380. transformers/models/dinov3_vit/modeling_dinov3_vit.py +18 -19
  381. transformers/models/dinov3_vit/modular_dinov3_vit.py +15 -16
  382. transformers/models/distilbert/configuration_distilbert.py +2 -8
  383. transformers/models/distilbert/modeling_distilbert.py +55 -55
  384. transformers/models/distilbert/tokenization_distilbert.py +1 -13
  385. transformers/models/doge/__init__.py +1 -0
  386. transformers/models/doge/configuration_doge.py +32 -39
  387. transformers/models/doge/modeling_doge.py +49 -45
  388. transformers/models/doge/modular_doge.py +63 -71
  389. transformers/models/donut/configuration_donut_swin.py +1 -0
  390. transformers/models/donut/image_processing_donut.py +29 -26
  391. transformers/models/donut/image_processing_donut_fast.py +15 -9
  392. transformers/models/donut/modeling_donut_swin.py +58 -62
  393. transformers/models/donut/processing_donut.py +26 -5
  394. transformers/models/dots1/configuration_dots1.py +33 -41
  395. transformers/models/dots1/modeling_dots1.py +45 -54
  396. transformers/models/dots1/modular_dots1.py +4 -5
  397. transformers/models/dpr/configuration_dpr.py +2 -19
  398. transformers/models/dpr/modeling_dpr.py +39 -42
  399. transformers/models/dpr/tokenization_dpr.py +9 -19
  400. transformers/models/dpr/tokenization_dpr_fast.py +9 -7
  401. transformers/models/dpt/configuration_dpt.py +2 -1
  402. transformers/models/dpt/image_processing_dpt.py +66 -65
  403. transformers/models/dpt/image_processing_dpt_fast.py +20 -18
  404. transformers/models/dpt/modeling_dpt.py +30 -32
  405. transformers/models/dpt/modular_dpt.py +17 -15
  406. transformers/models/edgetam/configuration_edgetam.py +3 -2
  407. transformers/models/edgetam/modeling_edgetam.py +86 -86
  408. transformers/models/edgetam/modular_edgetam.py +26 -21
  409. transformers/models/edgetam_video/__init__.py +1 -0
  410. transformers/models/edgetam_video/configuration_edgetam_video.py +1 -0
  411. transformers/models/edgetam_video/modeling_edgetam_video.py +158 -169
  412. transformers/models/edgetam_video/modular_edgetam_video.py +37 -30
  413. transformers/models/efficientloftr/configuration_efficientloftr.py +5 -4
  414. transformers/models/efficientloftr/image_processing_efficientloftr.py +16 -14
  415. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +9 -9
  416. transformers/models/efficientloftr/modeling_efficientloftr.py +38 -59
  417. transformers/models/efficientloftr/modular_efficientloftr.py +3 -1
  418. transformers/models/efficientnet/configuration_efficientnet.py +1 -0
  419. transformers/models/efficientnet/image_processing_efficientnet.py +32 -28
  420. transformers/models/efficientnet/image_processing_efficientnet_fast.py +19 -17
  421. transformers/models/efficientnet/modeling_efficientnet.py +15 -19
  422. transformers/models/electra/configuration_electra.py +3 -13
  423. transformers/models/electra/modeling_electra.py +103 -108
  424. transformers/models/emu3/configuration_emu3.py +17 -13
  425. transformers/models/emu3/image_processing_emu3.py +39 -44
  426. transformers/models/emu3/modeling_emu3.py +108 -148
  427. transformers/models/emu3/modular_emu3.py +73 -115
  428. transformers/models/emu3/processing_emu3.py +43 -18
  429. transformers/models/encodec/configuration_encodec.py +4 -2
  430. transformers/models/encodec/feature_extraction_encodec.py +13 -10
  431. transformers/models/encodec/modeling_encodec.py +29 -39
  432. transformers/models/encoder_decoder/configuration_encoder_decoder.py +2 -12
  433. transformers/models/encoder_decoder/modeling_encoder_decoder.py +43 -37
  434. transformers/models/eomt/configuration_eomt.py +1 -0
  435. transformers/models/eomt/image_processing_eomt.py +56 -66
  436. transformers/models/eomt/image_processing_eomt_fast.py +33 -76
  437. transformers/models/eomt/modeling_eomt.py +18 -23
  438. transformers/models/eomt/modular_eomt.py +13 -18
  439. transformers/models/ernie/configuration_ernie.py +3 -24
  440. transformers/models/ernie/modeling_ernie.py +132 -127
  441. transformers/models/ernie/modular_ernie.py +103 -97
  442. transformers/models/ernie4_5/configuration_ernie4_5.py +27 -23
  443. transformers/models/ernie4_5/modeling_ernie4_5.py +38 -36
  444. transformers/models/ernie4_5/modular_ernie4_5.py +4 -3
  445. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +36 -32
  446. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +55 -56
  447. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +46 -18
  448. transformers/models/esm/configuration_esm.py +15 -11
  449. transformers/models/esm/modeling_esm.py +34 -38
  450. transformers/models/esm/modeling_esmfold.py +49 -53
  451. transformers/models/esm/openfold_utils/chunk_utils.py +6 -6
  452. transformers/models/esm/openfold_utils/loss.py +2 -1
  453. transformers/models/esm/openfold_utils/protein.py +16 -15
  454. transformers/models/esm/openfold_utils/tensor_utils.py +6 -6
  455. transformers/models/esm/tokenization_esm.py +4 -2
  456. transformers/models/evolla/configuration_evolla.py +40 -50
  457. transformers/models/evolla/modeling_evolla.py +66 -71
  458. transformers/models/evolla/modular_evolla.py +47 -53
  459. transformers/models/evolla/processing_evolla.py +35 -23
  460. transformers/models/exaone4/configuration_exaone4.py +25 -23
  461. transformers/models/exaone4/modeling_exaone4.py +38 -35
  462. transformers/models/exaone4/modular_exaone4.py +46 -44
  463. transformers/models/falcon/configuration_falcon.py +26 -31
  464. transformers/models/falcon/modeling_falcon.py +80 -82
  465. transformers/models/falcon_h1/configuration_falcon_h1.py +51 -45
  466. transformers/models/falcon_h1/modeling_falcon_h1.py +82 -85
  467. transformers/models/falcon_h1/modular_falcon_h1.py +51 -56
  468. transformers/models/falcon_mamba/configuration_falcon_mamba.py +2 -1
  469. transformers/models/falcon_mamba/modeling_falcon_mamba.py +82 -75
  470. transformers/models/falcon_mamba/modular_falcon_mamba.py +45 -28
  471. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +6 -2
  472. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +60 -76
  473. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +3 -2
  474. transformers/models/flaubert/configuration_flaubert.py +5 -10
  475. transformers/models/flaubert/modeling_flaubert.py +143 -145
  476. transformers/models/flaubert/tokenization_flaubert.py +5 -3
  477. transformers/models/flava/configuration_flava.py +6 -5
  478. transformers/models/flava/image_processing_flava.py +67 -66
  479. transformers/models/flava/image_processing_flava_fast.py +49 -46
  480. transformers/models/flava/modeling_flava.py +136 -153
  481. transformers/models/flava/processing_flava.py +12 -2
  482. transformers/models/flex_olmo/__init__.py +1 -0
  483. transformers/models/flex_olmo/configuration_flex_olmo.py +32 -28
  484. transformers/models/flex_olmo/modeling_flex_olmo.py +47 -47
  485. transformers/models/flex_olmo/modular_flex_olmo.py +44 -40
  486. transformers/models/florence2/configuration_florence2.py +1 -0
  487. transformers/models/florence2/modeling_florence2.py +69 -111
  488. transformers/models/florence2/modular_florence2.py +101 -104
  489. transformers/models/florence2/processing_florence2.py +47 -18
  490. transformers/models/fnet/configuration_fnet.py +2 -6
  491. transformers/models/fnet/modeling_fnet.py +80 -83
  492. transformers/models/fnet/tokenization_fnet.py +1 -0
  493. transformers/models/focalnet/configuration_focalnet.py +1 -0
  494. transformers/models/focalnet/modeling_focalnet.py +45 -51
  495. transformers/models/fsmt/configuration_fsmt.py +17 -12
  496. transformers/models/fsmt/modeling_fsmt.py +48 -49
  497. transformers/models/fsmt/tokenization_fsmt.py +5 -3
  498. transformers/models/funnel/configuration_funnel.py +1 -8
  499. transformers/models/funnel/modeling_funnel.py +93 -99
  500. transformers/models/funnel/tokenization_funnel.py +27 -17
  501. transformers/models/fuyu/configuration_fuyu.py +34 -28
  502. transformers/models/fuyu/image_processing_fuyu.py +31 -29
  503. transformers/models/fuyu/image_processing_fuyu_fast.py +17 -17
  504. transformers/models/fuyu/modeling_fuyu.py +53 -53
  505. transformers/models/fuyu/processing_fuyu.py +34 -23
  506. transformers/models/gemma/configuration_gemma.py +30 -25
  507. transformers/models/gemma/modeling_gemma.py +50 -46
  508. transformers/models/gemma/modular_gemma.py +47 -42
  509. transformers/models/gemma/tokenization_gemma.py +30 -10
  510. transformers/models/gemma2/configuration_gemma2.py +35 -30
  511. transformers/models/gemma2/modeling_gemma2.py +42 -39
  512. transformers/models/gemma2/modular_gemma2.py +66 -63
  513. transformers/models/gemma3/configuration_gemma3.py +44 -44
  514. transformers/models/gemma3/image_processing_gemma3.py +31 -29
  515. transformers/models/gemma3/image_processing_gemma3_fast.py +13 -11
  516. transformers/models/gemma3/modeling_gemma3.py +207 -159
  517. transformers/models/gemma3/modular_gemma3.py +204 -153
  518. transformers/models/gemma3/processing_gemma3.py +5 -5
  519. transformers/models/gemma3n/configuration_gemma3n.py +26 -36
  520. transformers/models/gemma3n/feature_extraction_gemma3n.py +11 -9
  521. transformers/models/gemma3n/modeling_gemma3n.py +356 -222
  522. transformers/models/gemma3n/modular_gemma3n.py +207 -230
  523. transformers/models/gemma3n/processing_gemma3n.py +26 -12
  524. transformers/models/git/configuration_git.py +8 -5
  525. transformers/models/git/modeling_git.py +204 -266
  526. transformers/models/git/processing_git.py +14 -2
  527. transformers/models/glm/configuration_glm.py +28 -24
  528. transformers/models/glm/modeling_glm.py +40 -37
  529. transformers/models/glm/modular_glm.py +7 -4
  530. transformers/models/glm4/configuration_glm4.py +28 -24
  531. transformers/models/glm4/modeling_glm4.py +42 -40
  532. transformers/models/glm4/modular_glm4.py +10 -8
  533. transformers/models/glm46v/configuration_glm46v.py +1 -0
  534. transformers/models/glm46v/image_processing_glm46v.py +40 -35
  535. transformers/models/glm46v/image_processing_glm46v_fast.py +9 -9
  536. transformers/models/glm46v/modeling_glm46v.py +90 -137
  537. transformers/models/glm46v/modular_glm46v.py +3 -4
  538. transformers/models/glm46v/processing_glm46v.py +41 -7
  539. transformers/models/glm46v/video_processing_glm46v.py +11 -9
  540. transformers/models/glm4_moe/configuration_glm4_moe.py +32 -40
  541. transformers/models/glm4_moe/modeling_glm4_moe.py +42 -45
  542. transformers/models/glm4_moe/modular_glm4_moe.py +34 -42
  543. transformers/models/glm4v/configuration_glm4v.py +20 -18
  544. transformers/models/glm4v/image_processing_glm4v.py +40 -34
  545. transformers/models/glm4v/image_processing_glm4v_fast.py +9 -8
  546. transformers/models/glm4v/modeling_glm4v.py +205 -254
  547. transformers/models/glm4v/modular_glm4v.py +224 -210
  548. transformers/models/glm4v/processing_glm4v.py +41 -7
  549. transformers/models/glm4v/video_processing_glm4v.py +11 -9
  550. transformers/models/glm4v_moe/configuration_glm4v_moe.py +125 -136
  551. transformers/models/glm4v_moe/modeling_glm4v_moe.py +368 -377
  552. transformers/models/glm4v_moe/modular_glm4v_moe.py +169 -83
  553. transformers/models/glpn/configuration_glpn.py +1 -0
  554. transformers/models/glpn/image_processing_glpn.py +12 -11
  555. transformers/models/glpn/image_processing_glpn_fast.py +13 -11
  556. transformers/models/glpn/modeling_glpn.py +14 -16
  557. transformers/models/got_ocr2/configuration_got_ocr2.py +12 -4
  558. transformers/models/got_ocr2/image_processing_got_ocr2.py +24 -22
  559. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +11 -9
  560. transformers/models/got_ocr2/modeling_got_ocr2.py +80 -77
  561. transformers/models/got_ocr2/modular_got_ocr2.py +51 -54
  562. transformers/models/got_ocr2/processing_got_ocr2.py +63 -42
  563. transformers/models/gpt2/configuration_gpt2.py +2 -13
  564. transformers/models/gpt2/modeling_gpt2.py +115 -120
  565. transformers/models/gpt2/tokenization_gpt2.py +46 -15
  566. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +2 -5
  567. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +89 -79
  568. transformers/models/gpt_neo/configuration_gpt_neo.py +2 -9
  569. transformers/models/gpt_neo/modeling_gpt_neo.py +67 -83
  570. transformers/models/gpt_neox/configuration_gpt_neox.py +25 -25
  571. transformers/models/gpt_neox/modeling_gpt_neox.py +75 -76
  572. transformers/models/gpt_neox/modular_gpt_neox.py +66 -67
  573. transformers/models/gpt_neox/tokenization_gpt_neox.py +51 -9
  574. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +19 -24
  575. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +47 -46
  576. transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +3 -1
  577. transformers/models/gpt_oss/configuration_gpt_oss.py +28 -46
  578. transformers/models/gpt_oss/modeling_gpt_oss.py +121 -83
  579. transformers/models/gpt_oss/modular_gpt_oss.py +103 -64
  580. transformers/models/gpt_sw3/tokenization_gpt_sw3.py +4 -4
  581. transformers/models/gptj/configuration_gptj.py +4 -4
  582. transformers/models/gptj/modeling_gptj.py +87 -101
  583. transformers/models/granite/configuration_granite.py +33 -28
  584. transformers/models/granite/modeling_granite.py +46 -44
  585. transformers/models/granite/modular_granite.py +31 -29
  586. transformers/models/granite_speech/configuration_granite_speech.py +1 -0
  587. transformers/models/granite_speech/feature_extraction_granite_speech.py +3 -1
  588. transformers/models/granite_speech/modeling_granite_speech.py +52 -82
  589. transformers/models/granite_speech/processing_granite_speech.py +4 -11
  590. transformers/models/granitemoe/configuration_granitemoe.py +36 -31
  591. transformers/models/granitemoe/modeling_granitemoe.py +46 -41
  592. transformers/models/granitemoe/modular_granitemoe.py +27 -22
  593. transformers/models/granitemoehybrid/__init__.py +1 -0
  594. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +47 -46
  595. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +93 -97
  596. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +21 -54
  597. transformers/models/granitemoeshared/configuration_granitemoeshared.py +37 -33
  598. transformers/models/granitemoeshared/modeling_granitemoeshared.py +61 -54
  599. transformers/models/granitemoeshared/modular_granitemoeshared.py +21 -19
  600. transformers/models/grounding_dino/configuration_grounding_dino.py +4 -6
  601. transformers/models/grounding_dino/image_processing_grounding_dino.py +62 -60
  602. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +29 -28
  603. transformers/models/grounding_dino/modeling_grounding_dino.py +140 -155
  604. transformers/models/grounding_dino/modular_grounding_dino.py +3 -2
  605. transformers/models/grounding_dino/processing_grounding_dino.py +38 -10
  606. transformers/models/groupvit/configuration_groupvit.py +2 -4
  607. transformers/models/groupvit/modeling_groupvit.py +93 -107
  608. transformers/models/helium/configuration_helium.py +29 -25
  609. transformers/models/helium/modeling_helium.py +40 -38
  610. transformers/models/helium/modular_helium.py +7 -3
  611. transformers/models/herbert/tokenization_herbert.py +28 -10
  612. transformers/models/hgnet_v2/configuration_hgnet_v2.py +1 -0
  613. transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -24
  614. transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -24
  615. transformers/models/hiera/configuration_hiera.py +1 -0
  616. transformers/models/hiera/modeling_hiera.py +66 -72
  617. transformers/models/hubert/configuration_hubert.py +2 -4
  618. transformers/models/hubert/modeling_hubert.py +37 -42
  619. transformers/models/hubert/modular_hubert.py +11 -13
  620. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +31 -26
  621. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +38 -35
  622. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +6 -4
  623. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  624. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +36 -31
  625. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +42 -47
  626. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +9 -9
  627. transformers/models/ibert/configuration_ibert.py +2 -4
  628. transformers/models/ibert/modeling_ibert.py +62 -82
  629. transformers/models/ibert/quant_modules.py +1 -0
  630. transformers/models/idefics/configuration_idefics.py +8 -5
  631. transformers/models/idefics/image_processing_idefics.py +15 -13
  632. transformers/models/idefics/modeling_idefics.py +82 -75
  633. transformers/models/idefics/perceiver.py +3 -1
  634. transformers/models/idefics/processing_idefics.py +48 -32
  635. transformers/models/idefics/vision.py +25 -24
  636. transformers/models/idefics2/configuration_idefics2.py +3 -1
  637. transformers/models/idefics2/image_processing_idefics2.py +32 -31
  638. transformers/models/idefics2/image_processing_idefics2_fast.py +8 -8
  639. transformers/models/idefics2/modeling_idefics2.py +101 -127
  640. transformers/models/idefics2/processing_idefics2.py +68 -10
  641. transformers/models/idefics3/configuration_idefics3.py +4 -1
  642. transformers/models/idefics3/image_processing_idefics3.py +43 -42
  643. transformers/models/idefics3/image_processing_idefics3_fast.py +15 -40
  644. transformers/models/idefics3/modeling_idefics3.py +90 -115
  645. transformers/models/idefics3/processing_idefics3.py +69 -15
  646. transformers/models/ijepa/configuration_ijepa.py +1 -0
  647. transformers/models/ijepa/modeling_ijepa.py +11 -10
  648. transformers/models/ijepa/modular_ijepa.py +7 -5
  649. transformers/models/imagegpt/configuration_imagegpt.py +2 -9
  650. transformers/models/imagegpt/image_processing_imagegpt.py +18 -17
  651. transformers/models/imagegpt/image_processing_imagegpt_fast.py +16 -11
  652. transformers/models/imagegpt/modeling_imagegpt.py +65 -76
  653. transformers/models/informer/configuration_informer.py +9 -6
  654. transformers/models/informer/modeling_informer.py +86 -88
  655. transformers/models/informer/modular_informer.py +16 -14
  656. transformers/models/instructblip/configuration_instructblip.py +2 -2
  657. transformers/models/instructblip/modeling_instructblip.py +63 -103
  658. transformers/models/instructblip/processing_instructblip.py +36 -10
  659. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -2
  660. transformers/models/instructblipvideo/modeling_instructblipvideo.py +139 -157
  661. transformers/models/instructblipvideo/modular_instructblipvideo.py +64 -73
  662. transformers/models/instructblipvideo/processing_instructblipvideo.py +33 -14
  663. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +8 -6
  664. transformers/models/internvl/configuration_internvl.py +1 -0
  665. transformers/models/internvl/modeling_internvl.py +106 -85
  666. transformers/models/internvl/modular_internvl.py +67 -47
  667. transformers/models/internvl/processing_internvl.py +45 -12
  668. transformers/models/internvl/video_processing_internvl.py +12 -10
  669. transformers/models/jamba/configuration_jamba.py +8 -5
  670. transformers/models/jamba/modeling_jamba.py +66 -68
  671. transformers/models/jamba/modular_jamba.py +55 -54
  672. transformers/models/janus/configuration_janus.py +1 -0
  673. transformers/models/janus/image_processing_janus.py +37 -35
  674. transformers/models/janus/image_processing_janus_fast.py +20 -18
  675. transformers/models/janus/modeling_janus.py +191 -115
  676. transformers/models/janus/modular_janus.py +84 -133
  677. transformers/models/janus/processing_janus.py +43 -17
  678. transformers/models/jetmoe/configuration_jetmoe.py +26 -24
  679. transformers/models/jetmoe/modeling_jetmoe.py +46 -43
  680. transformers/models/jetmoe/modular_jetmoe.py +33 -31
  681. transformers/models/kosmos2/configuration_kosmos2.py +9 -10
  682. transformers/models/kosmos2/modeling_kosmos2.py +173 -208
  683. transformers/models/kosmos2/processing_kosmos2.py +55 -40
  684. transformers/models/kosmos2_5/__init__.py +1 -0
  685. transformers/models/kosmos2_5/configuration_kosmos2_5.py +9 -8
  686. transformers/models/kosmos2_5/image_processing_kosmos2_5.py +12 -10
  687. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +13 -4
  688. transformers/models/kosmos2_5/modeling_kosmos2_5.py +118 -132
  689. transformers/models/kosmos2_5/processing_kosmos2_5.py +29 -8
  690. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +28 -31
  691. transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +14 -12
  692. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +100 -110
  693. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +22 -28
  694. transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py +8 -2
  695. transformers/models/layoutlm/configuration_layoutlm.py +2 -14
  696. transformers/models/layoutlm/modeling_layoutlm.py +72 -77
  697. transformers/models/layoutlmv2/configuration_layoutlmv2.py +17 -14
  698. transformers/models/layoutlmv2/image_processing_layoutlmv2.py +21 -18
  699. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +9 -7
  700. transformers/models/layoutlmv2/modeling_layoutlmv2.py +50 -64
  701. transformers/models/layoutlmv2/processing_layoutlmv2.py +44 -14
  702. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +126 -73
  703. transformers/models/layoutlmv3/configuration_layoutlmv3.py +19 -16
  704. transformers/models/layoutlmv3/image_processing_layoutlmv3.py +26 -24
  705. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +11 -9
  706. transformers/models/layoutlmv3/modeling_layoutlmv3.py +56 -82
  707. transformers/models/layoutlmv3/processing_layoutlmv3.py +46 -14
  708. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +134 -74
  709. transformers/models/layoutxlm/configuration_layoutxlm.py +17 -14
  710. transformers/models/layoutxlm/modular_layoutxlm.py +1 -0
  711. transformers/models/layoutxlm/processing_layoutxlm.py +44 -14
  712. transformers/models/layoutxlm/tokenization_layoutxlm.py +113 -77
  713. transformers/models/led/configuration_led.py +12 -8
  714. transformers/models/led/modeling_led.py +266 -124
  715. transformers/models/levit/configuration_levit.py +1 -0
  716. transformers/models/levit/image_processing_levit.py +21 -19
  717. transformers/models/levit/image_processing_levit_fast.py +5 -4
  718. transformers/models/levit/modeling_levit.py +19 -38
  719. transformers/models/lfm2/configuration_lfm2.py +30 -27
  720. transformers/models/lfm2/modeling_lfm2.py +50 -47
  721. transformers/models/lfm2/modular_lfm2.py +30 -29
  722. transformers/models/lfm2_moe/__init__.py +1 -0
  723. transformers/models/lfm2_moe/configuration_lfm2_moe.py +9 -6
  724. transformers/models/lfm2_moe/modeling_lfm2_moe.py +53 -61
  725. transformers/models/lfm2_moe/modular_lfm2_moe.py +37 -13
  726. transformers/models/lfm2_vl/configuration_lfm2_vl.py +1 -4
  727. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +12 -41
  728. transformers/models/lfm2_vl/modeling_lfm2_vl.py +66 -84
  729. transformers/models/lfm2_vl/modular_lfm2_vl.py +56 -70
  730. transformers/models/lfm2_vl/processing_lfm2_vl.py +76 -96
  731. transformers/models/lightglue/image_processing_lightglue.py +15 -16
  732. transformers/models/lightglue/image_processing_lightglue_fast.py +9 -9
  733. transformers/models/lightglue/modeling_lightglue.py +31 -31
  734. transformers/models/lightglue/modular_lightglue.py +28 -29
  735. transformers/models/lilt/configuration_lilt.py +2 -6
  736. transformers/models/lilt/modeling_lilt.py +70 -76
  737. transformers/models/llama/configuration_llama.py +31 -26
  738. transformers/models/llama/modeling_llama.py +39 -36
  739. transformers/models/llama/tokenization_llama.py +44 -14
  740. transformers/models/llama4/configuration_llama4.py +30 -27
  741. transformers/models/llama4/image_processing_llama4_fast.py +14 -12
  742. transformers/models/llama4/modeling_llama4.py +113 -120
  743. transformers/models/llama4/processing_llama4.py +57 -33
  744. transformers/models/llava/configuration_llava.py +1 -10
  745. transformers/models/llava/image_processing_llava.py +28 -25
  746. transformers/models/llava/image_processing_llava_fast.py +11 -9
  747. transformers/models/llava/modeling_llava.py +109 -85
  748. transformers/models/llava/processing_llava.py +51 -18
  749. transformers/models/llava_next/configuration_llava_next.py +2 -2
  750. transformers/models/llava_next/image_processing_llava_next.py +45 -43
  751. transformers/models/llava_next/image_processing_llava_next_fast.py +13 -11
  752. transformers/models/llava_next/modeling_llava_next.py +107 -110
  753. transformers/models/llava_next/processing_llava_next.py +47 -18
  754. transformers/models/llava_next_video/configuration_llava_next_video.py +7 -4
  755. transformers/models/llava_next_video/modeling_llava_next_video.py +158 -175
  756. transformers/models/llava_next_video/modular_llava_next_video.py +150 -155
  757. transformers/models/llava_next_video/processing_llava_next_video.py +63 -21
  758. transformers/models/llava_next_video/video_processing_llava_next_video.py +1 -0
  759. transformers/models/llava_onevision/configuration_llava_onevision.py +7 -4
  760. transformers/models/llava_onevision/image_processing_llava_onevision.py +42 -40
  761. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +15 -14
  762. transformers/models/llava_onevision/modeling_llava_onevision.py +169 -177
  763. transformers/models/llava_onevision/modular_llava_onevision.py +156 -163
  764. transformers/models/llava_onevision/processing_llava_onevision.py +53 -21
  765. transformers/models/llava_onevision/video_processing_llava_onevision.py +1 -0
  766. transformers/models/longcat_flash/__init__.py +1 -0
  767. transformers/models/longcat_flash/configuration_longcat_flash.py +42 -37
  768. transformers/models/longcat_flash/modeling_longcat_flash.py +36 -36
  769. transformers/models/longcat_flash/modular_longcat_flash.py +21 -21
  770. transformers/models/longformer/configuration_longformer.py +5 -5
  771. transformers/models/longformer/modeling_longformer.py +101 -105
  772. transformers/models/longt5/configuration_longt5.py +7 -9
  773. transformers/models/longt5/modeling_longt5.py +49 -49
  774. transformers/models/luke/configuration_luke.py +2 -8
  775. transformers/models/luke/modeling_luke.py +181 -188
  776. transformers/models/luke/tokenization_luke.py +140 -107
  777. transformers/models/lxmert/configuration_lxmert.py +1 -16
  778. transformers/models/lxmert/modeling_lxmert.py +74 -65
  779. transformers/models/m2m_100/configuration_m2m_100.py +9 -7
  780. transformers/models/m2m_100/modeling_m2m_100.py +71 -83
  781. transformers/models/m2m_100/tokenization_m2m_100.py +8 -8
  782. transformers/models/mamba/configuration_mamba.py +2 -1
  783. transformers/models/mamba/modeling_mamba.py +66 -58
  784. transformers/models/mamba2/configuration_mamba2.py +8 -5
  785. transformers/models/mamba2/modeling_mamba2.py +69 -68
  786. transformers/models/marian/configuration_marian.py +5 -10
  787. transformers/models/marian/modeling_marian.py +87 -93
  788. transformers/models/marian/tokenization_marian.py +6 -6
  789. transformers/models/markuplm/configuration_markuplm.py +7 -4
  790. transformers/models/markuplm/feature_extraction_markuplm.py +2 -1
  791. transformers/models/markuplm/modeling_markuplm.py +70 -69
  792. transformers/models/markuplm/processing_markuplm.py +38 -31
  793. transformers/models/markuplm/tokenization_markuplm.py +136 -93
  794. transformers/models/mask2former/configuration_mask2former.py +8 -5
  795. transformers/models/mask2former/image_processing_mask2former.py +85 -84
  796. transformers/models/mask2former/image_processing_mask2former_fast.py +40 -37
  797. transformers/models/mask2former/modeling_mask2former.py +103 -118
  798. transformers/models/mask2former/modular_mask2former.py +8 -6
  799. transformers/models/maskformer/configuration_maskformer.py +9 -6
  800. transformers/models/maskformer/configuration_maskformer_swin.py +1 -0
  801. transformers/models/maskformer/image_processing_maskformer.py +85 -84
  802. transformers/models/maskformer/image_processing_maskformer_fast.py +40 -36
  803. transformers/models/maskformer/modeling_maskformer.py +65 -79
  804. transformers/models/maskformer/modeling_maskformer_swin.py +32 -36
  805. transformers/models/mbart/configuration_mbart.py +4 -9
  806. transformers/models/mbart/modeling_mbart.py +116 -131
  807. transformers/models/mbart/tokenization_mbart.py +54 -11
  808. transformers/models/mbart50/tokenization_mbart50.py +13 -8
  809. transformers/models/megatron_bert/configuration_megatron_bert.py +3 -13
  810. transformers/models/megatron_bert/modeling_megatron_bert.py +150 -148
  811. transformers/models/metaclip_2/configuration_metaclip_2.py +1 -4
  812. transformers/models/metaclip_2/modeling_metaclip_2.py +84 -91
  813. transformers/models/metaclip_2/modular_metaclip_2.py +45 -61
  814. transformers/models/mgp_str/configuration_mgp_str.py +1 -0
  815. transformers/models/mgp_str/modeling_mgp_str.py +18 -20
  816. transformers/models/mgp_str/processing_mgp_str.py +20 -3
  817. transformers/models/mgp_str/tokenization_mgp_str.py +3 -1
  818. transformers/models/mimi/configuration_mimi.py +40 -42
  819. transformers/models/mimi/modeling_mimi.py +113 -142
  820. transformers/models/minimax/__init__.py +1 -0
  821. transformers/models/minimax/configuration_minimax.py +43 -37
  822. transformers/models/minimax/modeling_minimax.py +51 -61
  823. transformers/models/minimax/modular_minimax.py +62 -68
  824. transformers/models/ministral/configuration_ministral.py +29 -25
  825. transformers/models/ministral/modeling_ministral.py +38 -36
  826. transformers/models/ministral/modular_ministral.py +37 -32
  827. transformers/models/ministral3/configuration_ministral3.py +27 -24
  828. transformers/models/ministral3/modeling_ministral3.py +37 -36
  829. transformers/models/ministral3/modular_ministral3.py +5 -4
  830. transformers/models/mistral/configuration_mistral.py +29 -24
  831. transformers/models/mistral/modeling_mistral.py +37 -36
  832. transformers/models/mistral/modular_mistral.py +12 -11
  833. transformers/models/mistral3/configuration_mistral3.py +1 -4
  834. transformers/models/mistral3/modeling_mistral3.py +86 -89
  835. transformers/models/mistral3/modular_mistral3.py +68 -69
  836. transformers/models/mixtral/configuration_mixtral.py +34 -29
  837. transformers/models/mixtral/modeling_mixtral.py +45 -50
  838. transformers/models/mixtral/modular_mixtral.py +31 -32
  839. transformers/models/mlcd/configuration_mlcd.py +1 -0
  840. transformers/models/mlcd/modeling_mlcd.py +14 -20
  841. transformers/models/mlcd/modular_mlcd.py +13 -17
  842. transformers/models/mllama/configuration_mllama.py +15 -10
  843. transformers/models/mllama/image_processing_mllama.py +25 -23
  844. transformers/models/mllama/image_processing_mllama_fast.py +11 -11
  845. transformers/models/mllama/modeling_mllama.py +94 -105
  846. transformers/models/mllama/processing_mllama.py +55 -6
  847. transformers/models/mluke/tokenization_mluke.py +107 -101
  848. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +3 -5
  849. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +140 -155
  850. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +3 -5
  851. transformers/models/mobilebert/configuration_mobilebert.py +2 -4
  852. transformers/models/mobilebert/modeling_mobilebert.py +85 -77
  853. transformers/models/mobilebert/tokenization_mobilebert.py +1 -0
  854. transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +1 -0
  855. transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +23 -20
  856. transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +1 -0
  857. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +16 -15
  858. transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +1 -0
  859. transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +51 -48
  860. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +15 -13
  861. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +22 -24
  862. transformers/models/mobilevit/configuration_mobilevit.py +1 -0
  863. transformers/models/mobilevit/image_processing_mobilevit.py +49 -46
  864. transformers/models/mobilevit/image_processing_mobilevit_fast.py +14 -12
  865. transformers/models/mobilevit/modeling_mobilevit.py +21 -28
  866. transformers/models/mobilevitv2/configuration_mobilevitv2.py +1 -0
  867. transformers/models/mobilevitv2/modeling_mobilevitv2.py +22 -28
  868. transformers/models/modernbert/configuration_modernbert.py +42 -44
  869. transformers/models/modernbert/modeling_modernbert.py +133 -145
  870. transformers/models/modernbert/modular_modernbert.py +170 -186
  871. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +40 -40
  872. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +57 -62
  873. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +86 -94
  874. transformers/models/moonshine/configuration_moonshine.py +31 -34
  875. transformers/models/moonshine/modeling_moonshine.py +71 -71
  876. transformers/models/moonshine/modular_moonshine.py +83 -88
  877. transformers/models/moshi/configuration_moshi.py +23 -46
  878. transformers/models/moshi/modeling_moshi.py +187 -157
  879. transformers/models/mpnet/configuration_mpnet.py +2 -6
  880. transformers/models/mpnet/modeling_mpnet.py +57 -62
  881. transformers/models/mpnet/tokenization_mpnet.py +15 -4
  882. transformers/models/mpt/configuration_mpt.py +9 -5
  883. transformers/models/mpt/modeling_mpt.py +60 -60
  884. transformers/models/mra/configuration_mra.py +2 -8
  885. transformers/models/mra/modeling_mra.py +57 -64
  886. transformers/models/mt5/configuration_mt5.py +8 -10
  887. transformers/models/mt5/modeling_mt5.py +95 -87
  888. transformers/models/musicgen/configuration_musicgen.py +8 -12
  889. transformers/models/musicgen/modeling_musicgen.py +122 -118
  890. transformers/models/musicgen/processing_musicgen.py +21 -3
  891. transformers/models/musicgen_melody/configuration_musicgen_melody.py +8 -15
  892. transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +9 -8
  893. transformers/models/musicgen_melody/modeling_musicgen_melody.py +123 -117
  894. transformers/models/musicgen_melody/processing_musicgen_melody.py +22 -3
  895. transformers/models/mvp/configuration_mvp.py +5 -8
  896. transformers/models/mvp/modeling_mvp.py +123 -135
  897. transformers/models/myt5/tokenization_myt5.py +10 -8
  898. transformers/models/nanochat/configuration_nanochat.py +8 -5
  899. transformers/models/nanochat/modeling_nanochat.py +40 -37
  900. transformers/models/nanochat/modular_nanochat.py +14 -12
  901. transformers/models/nemotron/configuration_nemotron.py +30 -25
  902. transformers/models/nemotron/modeling_nemotron.py +57 -56
  903. transformers/models/nllb/tokenization_nllb.py +28 -12
  904. transformers/models/nllb_moe/configuration_nllb_moe.py +9 -7
  905. transformers/models/nllb_moe/modeling_nllb_moe.py +69 -77
  906. transformers/models/nougat/image_processing_nougat.py +32 -29
  907. transformers/models/nougat/image_processing_nougat_fast.py +14 -12
  908. transformers/models/nougat/processing_nougat.py +39 -37
  909. transformers/models/nougat/tokenization_nougat.py +73 -18
  910. transformers/models/nystromformer/configuration_nystromformer.py +2 -8
  911. transformers/models/nystromformer/modeling_nystromformer.py +63 -74
  912. transformers/models/olmo/configuration_olmo.py +28 -23
  913. transformers/models/olmo/modeling_olmo.py +39 -36
  914. transformers/models/olmo/modular_olmo.py +11 -7
  915. transformers/models/olmo2/configuration_olmo2.py +28 -23
  916. transformers/models/olmo2/modeling_olmo2.py +41 -37
  917. transformers/models/olmo2/modular_olmo2.py +32 -29
  918. transformers/models/olmo3/__init__.py +1 -0
  919. transformers/models/olmo3/configuration_olmo3.py +30 -26
  920. transformers/models/olmo3/modeling_olmo3.py +39 -36
  921. transformers/models/olmo3/modular_olmo3.py +40 -37
  922. transformers/models/olmoe/configuration_olmoe.py +33 -29
  923. transformers/models/olmoe/modeling_olmoe.py +46 -52
  924. transformers/models/olmoe/modular_olmoe.py +15 -16
  925. transformers/models/omdet_turbo/configuration_omdet_turbo.py +4 -2
  926. transformers/models/omdet_turbo/modeling_omdet_turbo.py +47 -53
  927. transformers/models/omdet_turbo/processing_omdet_turbo.py +67 -19
  928. transformers/models/oneformer/configuration_oneformer.py +8 -5
  929. transformers/models/oneformer/image_processing_oneformer.py +84 -83
  930. transformers/models/oneformer/image_processing_oneformer_fast.py +42 -41
  931. transformers/models/oneformer/modeling_oneformer.py +171 -147
  932. transformers/models/oneformer/processing_oneformer.py +43 -28
  933. transformers/models/openai/configuration_openai.py +1 -16
  934. transformers/models/openai/modeling_openai.py +51 -65
  935. transformers/models/openai/tokenization_openai.py +47 -8
  936. transformers/models/opt/configuration_opt.py +7 -6
  937. transformers/models/opt/modeling_opt.py +76 -78
  938. transformers/models/ovis2/__init__.py +1 -0
  939. transformers/models/ovis2/configuration_ovis2.py +1 -0
  940. transformers/models/ovis2/image_processing_ovis2.py +24 -22
  941. transformers/models/ovis2/image_processing_ovis2_fast.py +11 -9
  942. transformers/models/ovis2/modeling_ovis2.py +142 -111
  943. transformers/models/ovis2/modular_ovis2.py +45 -90
  944. transformers/models/ovis2/processing_ovis2.py +40 -12
  945. transformers/models/owlv2/configuration_owlv2.py +2 -4
  946. transformers/models/owlv2/image_processing_owlv2.py +21 -20
  947. transformers/models/owlv2/image_processing_owlv2_fast.py +15 -12
  948. transformers/models/owlv2/modeling_owlv2.py +117 -133
  949. transformers/models/owlv2/modular_owlv2.py +14 -11
  950. transformers/models/owlv2/processing_owlv2.py +49 -20
  951. transformers/models/owlvit/configuration_owlvit.py +2 -4
  952. transformers/models/owlvit/image_processing_owlvit.py +22 -21
  953. transformers/models/owlvit/image_processing_owlvit_fast.py +3 -2
  954. transformers/models/owlvit/modeling_owlvit.py +116 -132
  955. transformers/models/owlvit/processing_owlvit.py +48 -20
  956. transformers/models/paligemma/configuration_paligemma.py +1 -4
  957. transformers/models/paligemma/modeling_paligemma.py +93 -103
  958. transformers/models/paligemma/processing_paligemma.py +66 -13
  959. transformers/models/parakeet/configuration_parakeet.py +14 -7
  960. transformers/models/parakeet/feature_extraction_parakeet.py +12 -10
  961. transformers/models/parakeet/modeling_parakeet.py +28 -32
  962. transformers/models/parakeet/modular_parakeet.py +20 -23
  963. transformers/models/parakeet/processing_parakeet.py +5 -13
  964. transformers/models/parakeet/{tokenization_parakeet.py → tokenization_parakeet_fast.py} +7 -5
  965. transformers/models/patchtsmixer/configuration_patchtsmixer.py +8 -5
  966. transformers/models/patchtsmixer/modeling_patchtsmixer.py +62 -70
  967. transformers/models/patchtst/configuration_patchtst.py +9 -6
  968. transformers/models/patchtst/modeling_patchtst.py +80 -97
  969. transformers/models/pegasus/configuration_pegasus.py +5 -8
  970. transformers/models/pegasus/modeling_pegasus.py +66 -72
  971. transformers/models/pegasus/tokenization_pegasus.py +45 -15
  972. transformers/models/pegasus_x/configuration_pegasus_x.py +4 -5
  973. transformers/models/pegasus_x/modeling_pegasus_x.py +52 -55
  974. transformers/models/perceiver/configuration_perceiver.py +1 -0
  975. transformers/models/perceiver/image_processing_perceiver.py +25 -22
  976. transformers/models/perceiver/image_processing_perceiver_fast.py +9 -7
  977. transformers/models/perceiver/modeling_perceiver.py +146 -165
  978. transformers/models/perceiver/tokenization_perceiver.py +6 -3
  979. transformers/models/perception_lm/configuration_perception_lm.py +1 -0
  980. transformers/models/perception_lm/image_processing_perception_lm_fast.py +10 -8
  981. transformers/models/perception_lm/modeling_perception_lm.py +70 -71
  982. transformers/models/perception_lm/modular_perception_lm.py +61 -65
  983. transformers/models/perception_lm/processing_perception_lm.py +47 -13
  984. transformers/models/perception_lm/video_processing_perception_lm.py +1 -0
  985. transformers/models/persimmon/configuration_persimmon.py +28 -23
  986. transformers/models/persimmon/modeling_persimmon.py +45 -43
  987. transformers/models/phi/configuration_phi.py +28 -23
  988. transformers/models/phi/modeling_phi.py +43 -40
  989. transformers/models/phi/modular_phi.py +24 -23
  990. transformers/models/phi3/configuration_phi3.py +33 -28
  991. transformers/models/phi3/modeling_phi3.py +38 -36
  992. transformers/models/phi3/modular_phi3.py +17 -13
  993. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +33 -30
  994. transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +9 -7
  995. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +11 -11
  996. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +78 -95
  997. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +80 -98
  998. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +44 -7
  999. transformers/models/phimoe/configuration_phimoe.py +36 -31
  1000. transformers/models/phimoe/modeling_phimoe.py +45 -50
  1001. transformers/models/phimoe/modular_phimoe.py +4 -3
  1002. transformers/models/phobert/tokenization_phobert.py +6 -4
  1003. transformers/models/pix2struct/configuration_pix2struct.py +10 -12
  1004. transformers/models/pix2struct/image_processing_pix2struct.py +19 -15
  1005. transformers/models/pix2struct/image_processing_pix2struct_fast.py +15 -12
  1006. transformers/models/pix2struct/modeling_pix2struct.py +52 -58
  1007. transformers/models/pix2struct/processing_pix2struct.py +30 -5
  1008. transformers/models/pixtral/configuration_pixtral.py +14 -11
  1009. transformers/models/pixtral/image_processing_pixtral.py +28 -26
  1010. transformers/models/pixtral/image_processing_pixtral_fast.py +11 -10
  1011. transformers/models/pixtral/modeling_pixtral.py +34 -28
  1012. transformers/models/pixtral/processing_pixtral.py +53 -21
  1013. transformers/models/plbart/configuration_plbart.py +5 -8
  1014. transformers/models/plbart/modeling_plbart.py +106 -119
  1015. transformers/models/plbart/modular_plbart.py +33 -39
  1016. transformers/models/plbart/tokenization_plbart.py +7 -4
  1017. transformers/models/poolformer/configuration_poolformer.py +1 -0
  1018. transformers/models/poolformer/image_processing_poolformer.py +24 -21
  1019. transformers/models/poolformer/image_processing_poolformer_fast.py +15 -13
  1020. transformers/models/poolformer/modeling_poolformer.py +13 -23
  1021. transformers/models/pop2piano/configuration_pop2piano.py +8 -7
  1022. transformers/models/pop2piano/feature_extraction_pop2piano.py +9 -6
  1023. transformers/models/pop2piano/modeling_pop2piano.py +24 -26
  1024. transformers/models/pop2piano/processing_pop2piano.py +33 -25
  1025. transformers/models/pop2piano/tokenization_pop2piano.py +23 -15
  1026. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +3 -3
  1027. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +28 -28
  1028. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +21 -20
  1029. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +13 -16
  1030. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +13 -16
  1031. transformers/models/prophetnet/configuration_prophetnet.py +38 -37
  1032. transformers/models/prophetnet/modeling_prophetnet.py +131 -114
  1033. transformers/models/prophetnet/tokenization_prophetnet.py +16 -14
  1034. transformers/models/pvt/configuration_pvt.py +1 -0
  1035. transformers/models/pvt/image_processing_pvt.py +27 -24
  1036. transformers/models/pvt/image_processing_pvt_fast.py +2 -1
  1037. transformers/models/pvt/modeling_pvt.py +21 -21
  1038. transformers/models/pvt_v2/configuration_pvt_v2.py +4 -2
  1039. transformers/models/pvt_v2/modeling_pvt_v2.py +25 -28
  1040. transformers/models/qwen2/configuration_qwen2.py +25 -32
  1041. transformers/models/qwen2/modeling_qwen2.py +38 -36
  1042. transformers/models/qwen2/modular_qwen2.py +12 -11
  1043. transformers/models/qwen2/tokenization_qwen2.py +23 -12
  1044. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +26 -32
  1045. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +277 -340
  1046. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +211 -278
  1047. transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +49 -41
  1048. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +35 -29
  1049. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +148 -203
  1050. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +118 -93
  1051. transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +43 -7
  1052. transformers/models/qwen2_audio/configuration_qwen2_audio.py +1 -0
  1053. transformers/models/qwen2_audio/modeling_qwen2_audio.py +40 -40
  1054. transformers/models/qwen2_audio/processing_qwen2_audio.py +42 -13
  1055. transformers/models/qwen2_moe/configuration_qwen2_moe.py +35 -42
  1056. transformers/models/qwen2_moe/modeling_qwen2_moe.py +46 -51
  1057. transformers/models/qwen2_moe/modular_qwen2_moe.py +10 -7
  1058. transformers/models/qwen2_vl/configuration_qwen2_vl.py +34 -29
  1059. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +42 -41
  1060. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +15 -12
  1061. transformers/models/qwen2_vl/modeling_qwen2_vl.py +153 -199
  1062. transformers/models/qwen2_vl/processing_qwen2_vl.py +44 -7
  1063. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +18 -38
  1064. transformers/models/qwen3/configuration_qwen3.py +27 -34
  1065. transformers/models/qwen3/modeling_qwen3.py +39 -36
  1066. transformers/models/qwen3/modular_qwen3.py +6 -4
  1067. transformers/models/qwen3_moe/configuration_qwen3_moe.py +32 -39
  1068. transformers/models/qwen3_moe/modeling_qwen3_moe.py +46 -51
  1069. transformers/models/qwen3_moe/modular_qwen3_moe.py +13 -10
  1070. transformers/models/qwen3_next/configuration_qwen3_next.py +35 -45
  1071. transformers/models/qwen3_next/modeling_qwen3_next.py +51 -47
  1072. transformers/models/qwen3_next/modular_qwen3_next.py +35 -34
  1073. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +101 -135
  1074. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +252 -355
  1075. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +196 -250
  1076. transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +48 -40
  1077. transformers/models/qwen3_vl/configuration_qwen3_vl.py +29 -27
  1078. transformers/models/qwen3_vl/modeling_qwen3_vl.py +155 -233
  1079. transformers/models/qwen3_vl/modular_qwen3_vl.py +179 -206
  1080. transformers/models/qwen3_vl/processing_qwen3_vl.py +42 -6
  1081. transformers/models/qwen3_vl/video_processing_qwen3_vl.py +12 -10
  1082. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +30 -23
  1083. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +303 -358
  1084. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +124 -87
  1085. transformers/models/rag/configuration_rag.py +15 -6
  1086. transformers/models/rag/modeling_rag.py +130 -127
  1087. transformers/models/rag/retrieval_rag.py +5 -3
  1088. transformers/models/rag/tokenization_rag.py +50 -0
  1089. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +30 -29
  1090. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +42 -53
  1091. transformers/models/reformer/configuration_reformer.py +8 -7
  1092. transformers/models/reformer/modeling_reformer.py +69 -80
  1093. transformers/models/reformer/tokenization_reformer.py +31 -11
  1094. transformers/models/regnet/configuration_regnet.py +1 -0
  1095. transformers/models/regnet/modeling_regnet.py +8 -15
  1096. transformers/models/rembert/configuration_rembert.py +2 -8
  1097. transformers/models/rembert/modeling_rembert.py +111 -121
  1098. transformers/models/rembert/tokenization_rembert.py +12 -2
  1099. transformers/models/resnet/configuration_resnet.py +1 -0
  1100. transformers/models/resnet/modeling_resnet.py +13 -27
  1101. transformers/models/roberta/configuration_roberta.py +3 -11
  1102. transformers/models/roberta/modeling_roberta.py +93 -94
  1103. transformers/models/roberta/modular_roberta.py +58 -58
  1104. transformers/models/roberta/tokenization_roberta.py +29 -17
  1105. transformers/models/roberta/tokenization_roberta_old.py +4 -2
  1106. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +3 -11
  1107. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +93 -94
  1108. transformers/models/roc_bert/configuration_roc_bert.py +2 -8
  1109. transformers/models/roc_bert/modeling_roc_bert.py +121 -122
  1110. transformers/models/roc_bert/tokenization_roc_bert.py +94 -88
  1111. transformers/models/roformer/configuration_roformer.py +3 -13
  1112. transformers/models/roformer/modeling_roformer.py +81 -85
  1113. transformers/models/roformer/tokenization_roformer.py +412 -74
  1114. transformers/models/roformer/tokenization_roformer_fast.py +160 -0
  1115. transformers/models/roformer/tokenization_utils.py +1 -0
  1116. transformers/models/rt_detr/configuration_rt_detr.py +2 -1
  1117. transformers/models/rt_detr/configuration_rt_detr_resnet.py +1 -0
  1118. transformers/models/rt_detr/image_processing_rt_detr.py +55 -54
  1119. transformers/models/rt_detr/image_processing_rt_detr_fast.py +26 -26
  1120. transformers/models/rt_detr/modeling_rt_detr.py +90 -99
  1121. transformers/models/rt_detr/modeling_rt_detr_resnet.py +6 -13
  1122. transformers/models/rt_detr/modular_rt_detr.py +16 -16
  1123. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +4 -6
  1124. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +90 -101
  1125. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +12 -19
  1126. transformers/models/rwkv/configuration_rwkv.py +4 -2
  1127. transformers/models/rwkv/modeling_rwkv.py +32 -31
  1128. transformers/models/sam/configuration_sam.py +1 -3
  1129. transformers/models/sam/image_processing_sam.py +60 -59
  1130. transformers/models/sam/image_processing_sam_fast.py +27 -25
  1131. transformers/models/sam/modeling_sam.py +41 -47
  1132. transformers/models/sam/processing_sam.py +27 -39
  1133. transformers/models/sam2/configuration_sam2.py +3 -2
  1134. transformers/models/sam2/image_processing_sam2_fast.py +15 -14
  1135. transformers/models/sam2/modeling_sam2.py +90 -96
  1136. transformers/models/sam2/modular_sam2.py +91 -86
  1137. transformers/models/sam2/processing_sam2.py +47 -31
  1138. transformers/models/sam2_video/configuration_sam2_video.py +1 -0
  1139. transformers/models/sam2_video/modeling_sam2_video.py +144 -151
  1140. transformers/models/sam2_video/modular_sam2_video.py +104 -101
  1141. transformers/models/sam2_video/processing_sam2_video.py +66 -49
  1142. transformers/models/sam2_video/video_processing_sam2_video.py +4 -1
  1143. transformers/models/sam3/configuration_sam3.py +2 -21
  1144. transformers/models/sam3/image_processing_sam3_fast.py +20 -17
  1145. transformers/models/sam3/modeling_sam3.py +170 -184
  1146. transformers/models/sam3/modular_sam3.py +8 -3
  1147. transformers/models/sam3/processing_sam3.py +52 -37
  1148. transformers/models/sam3_tracker/__init__.py +1 -0
  1149. transformers/models/sam3_tracker/configuration_sam3_tracker.py +3 -1
  1150. transformers/models/sam3_tracker/modeling_sam3_tracker.py +77 -82
  1151. transformers/models/sam3_tracker/modular_sam3_tracker.py +3 -8
  1152. transformers/models/sam3_tracker/processing_sam3_tracker.py +48 -31
  1153. transformers/models/sam3_tracker_video/__init__.py +1 -0
  1154. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +1 -25
  1155. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +122 -135
  1156. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +26 -35
  1157. transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +66 -50
  1158. transformers/models/sam3_video/configuration_sam3_video.py +1 -14
  1159. transformers/models/sam3_video/modeling_sam3_video.py +34 -33
  1160. transformers/models/sam3_video/processing_sam3_video.py +46 -26
  1161. transformers/models/sam_hq/__init__.py +1 -1
  1162. transformers/models/sam_hq/configuration_sam_hq.py +1 -3
  1163. transformers/models/sam_hq/modeling_sam_hq.py +69 -74
  1164. transformers/models/sam_hq/modular_sam_hq.py +25 -23
  1165. transformers/models/sam_hq/{processing_sam_hq.py → processing_samhq.py} +29 -41
  1166. transformers/models/seamless_m4t/configuration_seamless_m4t.py +10 -8
  1167. transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py +11 -8
  1168. transformers/models/seamless_m4t/modeling_seamless_m4t.py +194 -212
  1169. transformers/models/seamless_m4t/processing_seamless_m4t.py +39 -18
  1170. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +77 -40
  1171. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +10 -8
  1172. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +196 -204
  1173. transformers/models/seed_oss/configuration_seed_oss.py +32 -28
  1174. transformers/models/seed_oss/modeling_seed_oss.py +35 -33
  1175. transformers/models/seed_oss/modular_seed_oss.py +4 -3
  1176. transformers/models/segformer/configuration_segformer.py +10 -0
  1177. transformers/models/segformer/image_processing_segformer.py +42 -39
  1178. transformers/models/segformer/image_processing_segformer_fast.py +12 -10
  1179. transformers/models/segformer/modeling_segformer.py +31 -34
  1180. transformers/models/segformer/modular_segformer.py +10 -8
  1181. transformers/models/seggpt/configuration_seggpt.py +1 -0
  1182. transformers/models/seggpt/image_processing_seggpt.py +41 -38
  1183. transformers/models/seggpt/modeling_seggpt.py +38 -50
  1184. transformers/models/sew/configuration_sew.py +2 -4
  1185. transformers/models/sew/modeling_sew.py +36 -38
  1186. transformers/models/sew/modular_sew.py +13 -13
  1187. transformers/models/sew_d/configuration_sew_d.py +2 -4
  1188. transformers/models/sew_d/modeling_sew_d.py +30 -31
  1189. transformers/models/shieldgemma2/configuration_shieldgemma2.py +1 -0
  1190. transformers/models/shieldgemma2/modeling_shieldgemma2.py +17 -16
  1191. transformers/models/shieldgemma2/processing_shieldgemma2.py +5 -3
  1192. transformers/models/siglip/configuration_siglip.py +2 -4
  1193. transformers/models/siglip/image_processing_siglip.py +20 -17
  1194. transformers/models/siglip/image_processing_siglip_fast.py +1 -0
  1195. transformers/models/siglip/modeling_siglip.py +75 -84
  1196. transformers/models/siglip/processing_siglip.py +14 -2
  1197. transformers/models/siglip/tokenization_siglip.py +7 -6
  1198. transformers/models/siglip2/configuration_siglip2.py +2 -5
  1199. transformers/models/siglip2/image_processing_siglip2.py +16 -15
  1200. transformers/models/siglip2/image_processing_siglip2_fast.py +7 -6
  1201. transformers/models/siglip2/modeling_siglip2.py +129 -143
  1202. transformers/models/siglip2/modular_siglip2.py +46 -47
  1203. transformers/models/siglip2/processing_siglip2.py +14 -2
  1204. transformers/models/smollm3/configuration_smollm3.py +32 -29
  1205. transformers/models/smollm3/modeling_smollm3.py +39 -36
  1206. transformers/models/smollm3/modular_smollm3.py +35 -33
  1207. transformers/models/smolvlm/configuration_smolvlm.py +4 -2
  1208. transformers/models/smolvlm/image_processing_smolvlm.py +43 -42
  1209. transformers/models/smolvlm/image_processing_smolvlm_fast.py +15 -41
  1210. transformers/models/smolvlm/modeling_smolvlm.py +94 -126
  1211. transformers/models/smolvlm/modular_smolvlm.py +39 -50
  1212. transformers/models/smolvlm/processing_smolvlm.py +83 -15
  1213. transformers/models/smolvlm/video_processing_smolvlm.py +18 -16
  1214. transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +1 -0
  1215. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +27 -26
  1216. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -9
  1217. transformers/models/speech_to_text/feature_extraction_speech_to_text.py +13 -10
  1218. transformers/models/speech_to_text/modeling_speech_to_text.py +54 -66
  1219. transformers/models/speech_to_text/processing_speech_to_text.py +30 -4
  1220. transformers/models/speech_to_text/tokenization_speech_to_text.py +6 -5
  1221. transformers/models/speecht5/configuration_speecht5.py +9 -7
  1222. transformers/models/speecht5/feature_extraction_speecht5.py +37 -16
  1223. transformers/models/speecht5/modeling_speecht5.py +175 -213
  1224. transformers/models/speecht5/number_normalizer.py +1 -0
  1225. transformers/models/speecht5/processing_speecht5.py +37 -3
  1226. transformers/models/speecht5/tokenization_speecht5.py +5 -4
  1227. transformers/models/splinter/configuration_splinter.py +7 -6
  1228. transformers/models/splinter/modeling_splinter.py +59 -71
  1229. transformers/models/splinter/tokenization_splinter.py +30 -9
  1230. transformers/models/squeezebert/configuration_squeezebert.py +2 -14
  1231. transformers/models/squeezebert/modeling_squeezebert.py +62 -68
  1232. transformers/models/squeezebert/tokenization_squeezebert.py +1 -0
  1233. transformers/models/stablelm/configuration_stablelm.py +29 -24
  1234. transformers/models/stablelm/modeling_stablelm.py +45 -44
  1235. transformers/models/starcoder2/configuration_starcoder2.py +27 -30
  1236. transformers/models/starcoder2/modeling_starcoder2.py +41 -39
  1237. transformers/models/starcoder2/modular_starcoder2.py +16 -14
  1238. transformers/models/superglue/configuration_superglue.py +3 -7
  1239. transformers/models/superglue/image_processing_superglue.py +15 -15
  1240. transformers/models/superglue/image_processing_superglue_fast.py +10 -9
  1241. transformers/models/superglue/modeling_superglue.py +37 -42
  1242. transformers/models/superpoint/image_processing_superpoint.py +15 -15
  1243. transformers/models/superpoint/image_processing_superpoint_fast.py +11 -8
  1244. transformers/models/superpoint/modeling_superpoint.py +16 -18
  1245. transformers/models/swiftformer/configuration_swiftformer.py +1 -0
  1246. transformers/models/swiftformer/modeling_swiftformer.py +14 -18
  1247. transformers/models/swin/configuration_swin.py +1 -0
  1248. transformers/models/swin/modeling_swin.py +86 -86
  1249. transformers/models/swin2sr/configuration_swin2sr.py +1 -0
  1250. transformers/models/swin2sr/image_processing_swin2sr.py +13 -10
  1251. transformers/models/swin2sr/image_processing_swin2sr_fast.py +8 -4
  1252. transformers/models/swin2sr/modeling_swin2sr.py +63 -81
  1253. transformers/models/swinv2/configuration_swinv2.py +1 -0
  1254. transformers/models/swinv2/modeling_swinv2.py +104 -108
  1255. transformers/models/switch_transformers/configuration_switch_transformers.py +7 -11
  1256. transformers/models/switch_transformers/modeling_switch_transformers.py +44 -37
  1257. transformers/models/switch_transformers/modular_switch_transformers.py +41 -34
  1258. transformers/models/t5/configuration_t5.py +8 -14
  1259. transformers/models/t5/modeling_t5.py +92 -88
  1260. transformers/models/t5/tokenization_t5.py +9 -3
  1261. transformers/models/t5gemma/configuration_t5gemma.py +41 -43
  1262. transformers/models/t5gemma/modeling_t5gemma.py +107 -104
  1263. transformers/models/t5gemma/modular_t5gemma.py +120 -124
  1264. transformers/models/t5gemma2/configuration_t5gemma2.py +120 -80
  1265. transformers/models/t5gemma2/modeling_t5gemma2.py +125 -141
  1266. transformers/models/t5gemma2/modular_t5gemma2.py +104 -393
  1267. transformers/models/table_transformer/configuration_table_transformer.py +2 -1
  1268. transformers/models/table_transformer/modeling_table_transformer.py +49 -51
  1269. transformers/models/tapas/configuration_tapas.py +2 -12
  1270. transformers/models/tapas/modeling_tapas.py +67 -68
  1271. transformers/models/tapas/tokenization_tapas.py +153 -115
  1272. transformers/models/textnet/configuration_textnet.py +1 -0
  1273. transformers/models/textnet/image_processing_textnet.py +25 -22
  1274. transformers/models/textnet/image_processing_textnet_fast.py +10 -8
  1275. transformers/models/textnet/modeling_textnet.py +16 -28
  1276. transformers/models/time_series_transformer/configuration_time_series_transformer.py +8 -5
  1277. transformers/models/time_series_transformer/modeling_time_series_transformer.py +81 -83
  1278. transformers/models/timesfm/configuration_timesfm.py +1 -0
  1279. transformers/models/timesfm/modeling_timesfm.py +22 -33
  1280. transformers/models/timesfm/modular_timesfm.py +21 -32
  1281. transformers/models/timesformer/configuration_timesformer.py +1 -0
  1282. transformers/models/timesformer/modeling_timesformer.py +16 -15
  1283. transformers/models/timm_backbone/configuration_timm_backbone.py +1 -0
  1284. transformers/models/timm_backbone/modeling_timm_backbone.py +15 -17
  1285. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -5
  1286. transformers/models/timm_wrapper/image_processing_timm_wrapper.py +5 -4
  1287. transformers/models/timm_wrapper/modeling_timm_wrapper.py +29 -34
  1288. transformers/models/trocr/configuration_trocr.py +8 -11
  1289. transformers/models/trocr/modeling_trocr.py +44 -45
  1290. transformers/models/trocr/processing_trocr.py +25 -5
  1291. transformers/models/tvp/configuration_tvp.py +2 -5
  1292. transformers/models/tvp/image_processing_tvp.py +52 -50
  1293. transformers/models/tvp/image_processing_tvp_fast.py +15 -15
  1294. transformers/models/tvp/modeling_tvp.py +27 -27
  1295. transformers/models/tvp/processing_tvp.py +14 -2
  1296. transformers/models/udop/configuration_udop.py +7 -16
  1297. transformers/models/udop/modeling_udop.py +73 -71
  1298. transformers/models/udop/processing_udop.py +26 -7
  1299. transformers/models/udop/tokenization_udop.py +105 -84
  1300. transformers/models/umt5/configuration_umt5.py +7 -8
  1301. transformers/models/umt5/modeling_umt5.py +90 -94
  1302. transformers/models/unispeech/configuration_unispeech.py +2 -4
  1303. transformers/models/unispeech/modeling_unispeech.py +49 -51
  1304. transformers/models/unispeech/modular_unispeech.py +22 -22
  1305. transformers/models/unispeech_sat/configuration_unispeech_sat.py +2 -4
  1306. transformers/models/unispeech_sat/modeling_unispeech_sat.py +65 -69
  1307. transformers/models/unispeech_sat/modular_unispeech_sat.py +23 -23
  1308. transformers/models/univnet/feature_extraction_univnet.py +14 -14
  1309. transformers/models/univnet/modeling_univnet.py +8 -8
  1310. transformers/models/upernet/configuration_upernet.py +1 -0
  1311. transformers/models/upernet/modeling_upernet.py +13 -11
  1312. transformers/models/vaultgemma/__init__.py +1 -0
  1313. transformers/models/vaultgemma/configuration_vaultgemma.py +33 -29
  1314. transformers/models/vaultgemma/modeling_vaultgemma.py +41 -39
  1315. transformers/models/vaultgemma/modular_vaultgemma.py +31 -29
  1316. transformers/models/video_llama_3/configuration_video_llama_3.py +0 -4
  1317. transformers/models/video_llama_3/image_processing_video_llama_3.py +42 -43
  1318. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +14 -12
  1319. transformers/models/video_llama_3/modeling_video_llama_3.py +109 -157
  1320. transformers/models/video_llama_3/modular_video_llama_3.py +146 -155
  1321. transformers/models/video_llama_3/processing_video_llama_3.py +39 -5
  1322. transformers/models/video_llama_3/video_processing_video_llama_3.py +23 -42
  1323. transformers/models/video_llava/configuration_video_llava.py +1 -4
  1324. transformers/models/video_llava/image_processing_video_llava.py +38 -35
  1325. transformers/models/video_llava/modeling_video_llava.py +146 -146
  1326. transformers/models/video_llava/processing_video_llava.py +78 -38
  1327. transformers/models/video_llava/video_processing_video_llava.py +1 -0
  1328. transformers/models/videomae/configuration_videomae.py +1 -0
  1329. transformers/models/videomae/image_processing_videomae.py +34 -31
  1330. transformers/models/videomae/modeling_videomae.py +17 -14
  1331. transformers/models/videomae/video_processing_videomae.py +1 -0
  1332. transformers/models/vilt/configuration_vilt.py +4 -6
  1333. transformers/models/vilt/image_processing_vilt.py +30 -29
  1334. transformers/models/vilt/image_processing_vilt_fast.py +16 -15
  1335. transformers/models/vilt/modeling_vilt.py +90 -116
  1336. transformers/models/vilt/processing_vilt.py +14 -2
  1337. transformers/models/vipllava/configuration_vipllava.py +1 -4
  1338. transformers/models/vipllava/modeling_vipllava.py +70 -99
  1339. transformers/models/vipllava/modular_vipllava.py +54 -78
  1340. transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +1 -0
  1341. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +27 -28
  1342. transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +1 -0
  1343. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +41 -46
  1344. transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +16 -2
  1345. transformers/models/visual_bert/configuration_visual_bert.py +2 -6
  1346. transformers/models/visual_bert/modeling_visual_bert.py +92 -98
  1347. transformers/models/vit/configuration_vit.py +1 -0
  1348. transformers/models/vit/image_processing_vit.py +22 -19
  1349. transformers/models/vit/image_processing_vit_fast.py +1 -0
  1350. transformers/models/vit/modeling_vit.py +17 -17
  1351. transformers/models/vit_mae/configuration_vit_mae.py +1 -0
  1352. transformers/models/vit_mae/modeling_vit_mae.py +27 -29
  1353. transformers/models/vit_msn/configuration_vit_msn.py +1 -0
  1354. transformers/models/vit_msn/modeling_vit_msn.py +16 -18
  1355. transformers/models/vitdet/configuration_vitdet.py +1 -0
  1356. transformers/models/vitdet/modeling_vitdet.py +14 -14
  1357. transformers/models/vitmatte/configuration_vitmatte.py +5 -2
  1358. transformers/models/vitmatte/image_processing_vitmatte.py +18 -15
  1359. transformers/models/vitmatte/image_processing_vitmatte_fast.py +18 -16
  1360. transformers/models/vitmatte/modeling_vitmatte.py +11 -14
  1361. transformers/models/vitpose/configuration_vitpose.py +7 -4
  1362. transformers/models/vitpose/image_processing_vitpose.py +25 -24
  1363. transformers/models/vitpose/image_processing_vitpose_fast.py +11 -9
  1364. transformers/models/vitpose/modeling_vitpose.py +14 -14
  1365. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +1 -0
  1366. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +10 -8
  1367. transformers/models/vits/configuration_vits.py +1 -4
  1368. transformers/models/vits/modeling_vits.py +42 -44
  1369. transformers/models/vits/tokenization_vits.py +4 -3
  1370. transformers/models/vivit/configuration_vivit.py +1 -0
  1371. transformers/models/vivit/image_processing_vivit.py +39 -36
  1372. transformers/models/vivit/modeling_vivit.py +8 -6
  1373. transformers/models/vjepa2/__init__.py +1 -0
  1374. transformers/models/vjepa2/configuration_vjepa2.py +1 -0
  1375. transformers/models/vjepa2/modeling_vjepa2.py +32 -31
  1376. transformers/models/vjepa2/video_processing_vjepa2.py +1 -0
  1377. transformers/models/voxtral/__init__.py +1 -0
  1378. transformers/models/voxtral/configuration_voxtral.py +2 -0
  1379. transformers/models/voxtral/modeling_voxtral.py +47 -40
  1380. transformers/models/voxtral/modular_voxtral.py +40 -37
  1381. transformers/models/voxtral/processing_voxtral.py +48 -25
  1382. transformers/models/wav2vec2/configuration_wav2vec2.py +2 -4
  1383. transformers/models/wav2vec2/feature_extraction_wav2vec2.py +10 -7
  1384. transformers/models/wav2vec2/modeling_wav2vec2.py +121 -73
  1385. transformers/models/wav2vec2/processing_wav2vec2.py +35 -6
  1386. transformers/models/wav2vec2/tokenization_wav2vec2.py +332 -20
  1387. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +2 -4
  1388. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +62 -70
  1389. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +48 -57
  1390. transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +35 -6
  1391. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +2 -4
  1392. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +77 -90
  1393. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +30 -37
  1394. transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +17 -16
  1395. transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +55 -36
  1396. transformers/models/wavlm/configuration_wavlm.py +2 -4
  1397. transformers/models/wavlm/modeling_wavlm.py +48 -50
  1398. transformers/models/wavlm/modular_wavlm.py +5 -4
  1399. transformers/models/whisper/configuration_whisper.py +5 -6
  1400. transformers/models/whisper/english_normalizer.py +4 -3
  1401. transformers/models/whisper/feature_extraction_whisper.py +24 -9
  1402. transformers/models/whisper/generation_whisper.py +48 -26
  1403. transformers/models/whisper/modeling_whisper.py +73 -79
  1404. transformers/models/whisper/processing_whisper.py +20 -3
  1405. transformers/models/whisper/tokenization_whisper.py +43 -11
  1406. transformers/models/x_clip/configuration_x_clip.py +2 -4
  1407. transformers/models/x_clip/modeling_x_clip.py +93 -96
  1408. transformers/models/x_clip/processing_x_clip.py +14 -2
  1409. transformers/models/xcodec/configuration_xcodec.py +6 -4
  1410. transformers/models/xcodec/modeling_xcodec.py +17 -20
  1411. transformers/models/xglm/configuration_xglm.py +8 -9
  1412. transformers/models/xglm/modeling_xglm.py +55 -60
  1413. transformers/models/xglm/tokenization_xglm.py +11 -3
  1414. transformers/models/xlm/configuration_xlm.py +8 -10
  1415. transformers/models/xlm/modeling_xlm.py +144 -144
  1416. transformers/models/xlm/tokenization_xlm.py +5 -3
  1417. transformers/models/xlm_roberta/configuration_xlm_roberta.py +3 -11
  1418. transformers/models/xlm_roberta/modeling_xlm_roberta.py +194 -195
  1419. transformers/models/xlm_roberta/modular_xlm_roberta.py +53 -50
  1420. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +18 -8
  1421. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +2 -10
  1422. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +93 -94
  1423. transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +70 -67
  1424. transformers/models/xlnet/configuration_xlnet.py +12 -3
  1425. transformers/models/xlnet/modeling_xlnet.py +163 -152
  1426. transformers/models/xlnet/tokenization_xlnet.py +9 -2
  1427. transformers/models/xlstm/configuration_xlstm.py +12 -8
  1428. transformers/models/xlstm/modeling_xlstm.py +65 -62
  1429. transformers/models/xmod/configuration_xmod.py +3 -11
  1430. transformers/models/xmod/modeling_xmod.py +110 -108
  1431. transformers/models/yolos/configuration_yolos.py +1 -0
  1432. transformers/models/yolos/image_processing_yolos.py +62 -60
  1433. transformers/models/yolos/image_processing_yolos_fast.py +45 -42
  1434. transformers/models/yolos/modeling_yolos.py +16 -16
  1435. transformers/models/yolos/modular_yolos.py +19 -17
  1436. transformers/models/yoso/configuration_yoso.py +2 -8
  1437. transformers/models/yoso/modeling_yoso.py +63 -70
  1438. transformers/models/zamba/configuration_zamba.py +8 -5
  1439. transformers/models/zamba/modeling_zamba.py +78 -81
  1440. transformers/models/zamba2/configuration_zamba2.py +50 -44
  1441. transformers/models/zamba2/modeling_zamba2.py +97 -97
  1442. transformers/models/zamba2/modular_zamba2.py +48 -46
  1443. transformers/models/zoedepth/configuration_zoedepth.py +2 -1
  1444. transformers/models/zoedepth/image_processing_zoedepth.py +29 -28
  1445. transformers/models/zoedepth/image_processing_zoedepth_fast.py +24 -21
  1446. transformers/models/zoedepth/modeling_zoedepth.py +18 -26
  1447. transformers/pipelines/__init__.py +114 -57
  1448. transformers/pipelines/any_to_any.py +22 -14
  1449. transformers/pipelines/audio_utils.py +2 -1
  1450. transformers/pipelines/automatic_speech_recognition.py +12 -20
  1451. transformers/pipelines/base.py +27 -15
  1452. transformers/{models/pe_audio/processing_pe_audio.py → pipelines/deprecated/__init__.py} +3 -10
  1453. transformers/pipelines/deprecated/text2text_generation.py +408 -0
  1454. transformers/pipelines/document_question_answering.py +2 -4
  1455. transformers/pipelines/image_text_to_text.py +1 -0
  1456. transformers/pipelines/image_to_text.py +229 -0
  1457. transformers/pipelines/question_answering.py +44 -5
  1458. transformers/pipelines/text_classification.py +14 -1
  1459. transformers/pipelines/text_generation.py +1 -1
  1460. transformers/pipelines/text_to_audio.py +2 -2
  1461. transformers/pipelines/token_classification.py +22 -1
  1462. transformers/pipelines/video_classification.py +9 -1
  1463. transformers/pipelines/zero_shot_audio_classification.py +1 -0
  1464. transformers/pipelines/zero_shot_classification.py +6 -0
  1465. transformers/pipelines/zero_shot_image_classification.py +7 -0
  1466. transformers/processing_utils.py +145 -230
  1467. transformers/quantizers/auto.py +4 -2
  1468. transformers/quantizers/base.py +173 -53
  1469. transformers/quantizers/quantizer_aqlm.py +23 -2
  1470. transformers/quantizers/quantizer_auto_round.py +12 -2
  1471. transformers/quantizers/quantizer_awq.py +89 -20
  1472. transformers/quantizers/quantizer_bitnet.py +14 -4
  1473. transformers/quantizers/quantizer_bnb_4bit.py +155 -18
  1474. transformers/quantizers/quantizer_bnb_8bit.py +110 -24
  1475. transformers/quantizers/quantizer_compressed_tensors.py +9 -2
  1476. transformers/quantizers/quantizer_eetq.py +74 -16
  1477. transformers/quantizers/quantizer_fbgemm_fp8.py +138 -38
  1478. transformers/quantizers/quantizer_finegrained_fp8.py +113 -26
  1479. transformers/quantizers/quantizer_fp_quant.py +82 -52
  1480. transformers/quantizers/quantizer_gptq.py +28 -8
  1481. transformers/quantizers/quantizer_higgs.py +60 -42
  1482. transformers/quantizers/quantizer_hqq.py +153 -144
  1483. transformers/quantizers/quantizer_mxfp4.py +194 -14
  1484. transformers/quantizers/quantizer_quanto.py +79 -35
  1485. transformers/quantizers/quantizer_quark.py +18 -36
  1486. transformers/quantizers/quantizer_spqr.py +12 -4
  1487. transformers/quantizers/quantizer_torchao.py +325 -50
  1488. transformers/quantizers/quantizer_vptq.py +27 -4
  1489. transformers/quantizers/quantizers_utils.py +0 -20
  1490. transformers/safetensors_conversion.py +3 -9
  1491. transformers/testing_utils.py +82 -326
  1492. transformers/tokenization_mistral_common.py +903 -568
  1493. transformers/tokenization_utils_base.py +340 -220
  1494. transformers/tokenization_utils_sentencepiece.py +6 -5
  1495. transformers/tokenization_utils_tokenizers.py +113 -226
  1496. transformers/trainer.py +53 -60
  1497. transformers/trainer_callback.py +0 -8
  1498. transformers/trainer_seq2seq.py +1 -5
  1499. transformers/trainer_utils.py +1 -1
  1500. transformers/training_args.py +41 -77
  1501. transformers/utils/__init__.py +4 -8
  1502. transformers/utils/attention_visualizer.py +5 -5
  1503. transformers/utils/auto_docstring.py +37 -599
  1504. transformers/utils/doc.py +36 -4
  1505. transformers/utils/dummy_pt_objects.py +42 -0
  1506. transformers/utils/generic.py +28 -111
  1507. transformers/utils/hub.py +15 -5
  1508. transformers/utils/import_utils.py +32 -165
  1509. transformers/utils/kernel_config.py +19 -74
  1510. transformers/utils/loading_report.py +15 -25
  1511. transformers/utils/quantization_config.py +241 -72
  1512. transformers/video_processing_utils.py +39 -41
  1513. transformers/video_utils.py +22 -18
  1514. {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/METADATA +236 -284
  1515. transformers-5.0.0rc0.dist-info/RECORD +1987 -0
  1516. {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/WHEEL +1 -1
  1517. transformers/integrations/moe.py +0 -360
  1518. transformers/integrations/quark.py +0 -53
  1519. transformers/loss/loss_lw_detr.py +0 -356
  1520. transformers/models/ernie4_5_vl_moe/__init__.py +0 -31
  1521. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +0 -340
  1522. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +0 -455
  1523. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +0 -231
  1524. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +0 -1936
  1525. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +0 -1925
  1526. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +0 -249
  1527. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +0 -593
  1528. transformers/models/fast_vlm/__init__.py +0 -27
  1529. transformers/models/fast_vlm/configuration_fast_vlm.py +0 -137
  1530. transformers/models/fast_vlm/modeling_fast_vlm.py +0 -432
  1531. transformers/models/fast_vlm/modular_fast_vlm.py +0 -373
  1532. transformers/models/glm4_moe_lite/__init__.py +0 -28
  1533. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +0 -233
  1534. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +0 -740
  1535. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +0 -302
  1536. transformers/models/glm_image/__init__.py +0 -31
  1537. transformers/models/glm_image/configuration_glm_image.py +0 -351
  1538. transformers/models/glm_image/image_processing_glm_image.py +0 -503
  1539. transformers/models/glm_image/image_processing_glm_image_fast.py +0 -294
  1540. transformers/models/glm_image/modeling_glm_image.py +0 -1642
  1541. transformers/models/glm_image/modular_glm_image.py +0 -1531
  1542. transformers/models/glm_image/processing_glm_image.py +0 -217
  1543. transformers/models/glmasr/__init__.py +0 -29
  1544. transformers/models/glmasr/configuration_glmasr.py +0 -196
  1545. transformers/models/glmasr/modeling_glmasr.py +0 -517
  1546. transformers/models/glmasr/modular_glmasr.py +0 -443
  1547. transformers/models/glmasr/processing_glmasr.py +0 -331
  1548. transformers/models/jais2/__init__.py +0 -27
  1549. transformers/models/jais2/configuration_jais2.py +0 -148
  1550. transformers/models/jais2/modeling_jais2.py +0 -484
  1551. transformers/models/jais2/modular_jais2.py +0 -194
  1552. transformers/models/lasr/__init__.py +0 -29
  1553. transformers/models/lasr/configuration_lasr.py +0 -244
  1554. transformers/models/lasr/feature_extraction_lasr.py +0 -275
  1555. transformers/models/lasr/modeling_lasr.py +0 -727
  1556. transformers/models/lasr/modular_lasr.py +0 -574
  1557. transformers/models/lasr/processing_lasr.py +0 -100
  1558. transformers/models/lasr/tokenization_lasr.py +0 -184
  1559. transformers/models/lighton_ocr/__init__.py +0 -28
  1560. transformers/models/lighton_ocr/configuration_lighton_ocr.py +0 -128
  1561. transformers/models/lighton_ocr/modeling_lighton_ocr.py +0 -463
  1562. transformers/models/lighton_ocr/modular_lighton_ocr.py +0 -404
  1563. transformers/models/lighton_ocr/processing_lighton_ocr.py +0 -229
  1564. transformers/models/lw_detr/__init__.py +0 -27
  1565. transformers/models/lw_detr/configuration_lw_detr.py +0 -374
  1566. transformers/models/lw_detr/modeling_lw_detr.py +0 -1702
  1567. transformers/models/lw_detr/modular_lw_detr.py +0 -1615
  1568. transformers/models/minimax_m2/__init__.py +0 -28
  1569. transformers/models/minimax_m2/configuration_minimax_m2.py +0 -188
  1570. transformers/models/minimax_m2/modeling_minimax_m2.py +0 -704
  1571. transformers/models/minimax_m2/modular_minimax_m2.py +0 -346
  1572. transformers/models/paddleocr_vl/__init__.py +0 -31
  1573. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +0 -335
  1574. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +0 -503
  1575. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +0 -209
  1576. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +0 -1683
  1577. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +0 -1380
  1578. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +0 -133
  1579. transformers/models/pe_audio/__init__.py +0 -29
  1580. transformers/models/pe_audio/configuration_pe_audio.py +0 -204
  1581. transformers/models/pe_audio/feature_extraction_pe_audio.py +0 -160
  1582. transformers/models/pe_audio/modeling_pe_audio.py +0 -819
  1583. transformers/models/pe_audio/modular_pe_audio.py +0 -298
  1584. transformers/models/pe_audio_video/__init__.py +0 -28
  1585. transformers/models/pe_audio_video/configuration_pe_audio_video.py +0 -223
  1586. transformers/models/pe_audio_video/modeling_pe_audio_video.py +0 -971
  1587. transformers/models/pe_audio_video/modular_pe_audio_video.py +0 -763
  1588. transformers/models/pe_video/__init__.py +0 -29
  1589. transformers/models/pe_video/configuration_pe_video.py +0 -209
  1590. transformers/models/pe_video/modeling_pe_video.py +0 -647
  1591. transformers/models/pe_video/modular_pe_video.py +0 -231
  1592. transformers/models/pe_video/processing_pe_video.py +0 -10
  1593. transformers/models/pe_video/video_processing_pe_video.py +0 -64
  1594. transformers/models/pixio/__init__.py +0 -29
  1595. transformers/models/pixio/configuration_pixio.py +0 -150
  1596. transformers/models/pixio/modeling_pixio.py +0 -507
  1597. transformers/models/pixio/modular_pixio.py +0 -403
  1598. transformers/models/solar_open/__init__.py +0 -27
  1599. transformers/models/solar_open/configuration_solar_open.py +0 -184
  1600. transformers/models/solar_open/modeling_solar_open.py +0 -642
  1601. transformers/models/solar_open/modular_solar_open.py +0 -224
  1602. transformers/trainer_jit_checkpoint.py +0 -125
  1603. transformers-5.0.0.dist-info/RECORD +0 -2068
  1604. {transformers-5.0.0.dist-info/licenses → transformers-5.0.0rc0.dist-info}/LICENSE +0 -0
  1605. {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/entry_points.txt +0 -0
  1606. {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/top_level.txt +0 -0
@@ -14,39 +14,39 @@
 import os
 import re
 import shutil
-from collections.abc import Callable, Sequence
+import warnings
+from collections.abc import Callable, Mapping, Sized
 from enum import Enum
 from pathlib import Path
-from typing import Any, Literal, Union, overload
+from typing import Any, Union, overload

 import numpy as np
 from huggingface_hub import create_repo

 from transformers.audio_utils import load_audio_as
 from transformers.tokenization_utils_base import (
+    LARGE_INTEGER,
     VERY_LARGE_INTEGER,
-    AddedToken,
     BatchEncoding,
     EncodedInput,
     PreTokenizedInput,
-    PreTrainedTokenizerBase,
     TextInput,
     TruncationStrategy,
 )
 from transformers.utils import PaddingStrategy, TensorType, add_end_docstrings, logging, to_py_obj
+from transformers.utils.generic import is_torch_tensor
+from transformers.utils.hub import PushToHubMixin
 from transformers.utils.import_utils import is_mistral_common_available, is_torch_available, requires


 if is_mistral_common_available():
     from mistral_common.protocol.instruct.request import ChatCompletionRequest
     from mistral_common.protocol.instruct.validator import ValidationMode
-    from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy, SpecialTokens
+    from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy, TokenizerVersion
+    from mistral_common.tokens.tokenizers.image import MultiModalVersion
     from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
     from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-    from mistral_common.tokens.tokenizers.utils import (
-        download_tokenizer_from_hf_hub,
-        get_one_valid_tokenizer_file,
-    )
+    from mistral_common.tokens.tokenizers.utils import download_tokenizer_from_hf_hub


 if is_torch_available():
@@ -103,10 +103,6 @@ ENCODE_KWARGS_DOCSTRING = r"""
 """

 ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
-        return_token_type_ids (`bool`, *optional*):
-            Whether to return token type IDs. For `MistralCommonBackend` it returns a list of zeros of the sequence length as only one sequence is supported.
-
-            [What are token type IDs?](../glossary#token-type-ids)
         return_attention_mask (`bool`, *optional*):
             Whether to return the attention mask. If left to the default, will return the attention mask according
             to the specific tokenizer's default, defined by the `return_outputs` attribute.
@@ -122,8 +118,6 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
             Whether or not to return the lengths of the encoded inputs.
         verbose (`bool`, *optional*, defaults to `True`):
             Whether or not to print more information and warnings.
-        return_offsets_mapping (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
-        split_special_tokens (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
         **kwargs: passed to the `self.tokenize()` method

     Return:
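For orientation, here is a minimal usage sketch of the encode kwargs documented in this hunk. The repo id is an illustrative assumption (any checkpoint shipping a mistral-common tokenizer would do), and the output keys are assumed to follow the usual `BatchEncoding` convention; this is not taken verbatim from the diff.

# Hypothetical sketch, assuming a mistral-common tokenizer repo is available.
from transformers.tokenization_mistral_common import MistralCommonBackend

tok = MistralCommonBackend.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")  # assumed repo id

# return_attention_mask / return_length are still documented above; the removed
# return_token_type_ids / return_offsets_mapping / split_special_tokens kwargs
# are no longer part of this backend's documented signature in 5.0.0rc0.
enc = tok(
    "Hello world",
    padding="max_length",
    max_length=8,
    return_attention_mask=True,
    return_length=True,
)
print(enc["input_ids"], enc["attention_mask"], enc["length"])  # keys assumed per BatchEncoding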
@@ -155,35 +149,8 @@ class MistralTokenizerType(str, Enum):
     tekken = "tekken"


-@overload
-def _maybe_remove_lang(text: str, skip_special_tokens: bool) -> str: ...
-@overload
-def _maybe_remove_lang(text: list[str], skip_special_tokens: bool) -> list[str]: ...
-def _maybe_remove_lang(text: str | list[str], skip_special_tokens: bool) -> str | list[str]:
-    # in the specific case of Voxtral, the added f"lang:xx" (always a two char language code since it follows ISO 639-1 alpha-2 format)
-    # is not considered as a special token by mistral-common and is encoded/ decoded as normal text.
-    # Nevertheless we should remove it to ease users life.
-    if not skip_special_tokens:
-        return text
-
-    if isinstance(text, str):
-        return re.sub(r"^lang:[a-z]{2}", "", text)
-
-    return [re.sub(r"^lang:[a-z]{2}", "", string) for string in text]
-
-
-_MAP_SPECIAL_TOKENS = {
-    "bos_token": SpecialTokens.bos.value,
-    "eos_token": SpecialTokens.eos.value,
-    "pad_token": SpecialTokens.pad.value,
-    "unk_token": SpecialTokens.unk.value,
-}
-
-_VALID_INIT_KWARGS = {"_from_auto", "backend", "files_loaded"}
-
-
 @requires(backends=("mistral-common",))
-class MistralCommonBackend(PreTrainedTokenizerBase):
+class MistralCommonBackend(PushToHubMixin):
     """
     Class to wrap `mistral-common` tokenizers.

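The removed `_maybe_remove_lang` helper stripped Voxtral's leading `lang:xx` marker (an ISO 639-1 code that mistral-common encodes as plain text) when `skip_special_tokens` was set. A standalone illustration of that regex, with made-up input strings:

import re

# Behavior of the regex used by the removed _maybe_remove_lang helper:
# a leading "lang:" + two-letter code is stripped; anything else is untouched.
print(re.sub(r"^lang:[a-z]{2}", "", "lang:enHello there"))  # -> "Hello there"
print(re.sub(r"^lang:[a-z]{2}", "", "Hello there"))         # -> "Hello there" (unchanged)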
@@ -198,13 +165,34 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
198
165
  For more info on `mistral-common`, see [mistral-common](https://github.com/mistralai/mistral-common).
199
166
 
200
167
  This class is a wrapper around a `mistral_common.tokens.tokenizers.mistral.MistralTokenizer`.
201
- It provides a Hugging Face compatible interface to tokenize using the official mistral-common tokenizer and inherits from the `PreTrainedTokenizerBase` class.
202
-
203
- Here are the key behavior differences with the `PythonBackend` class:
204
-
205
- - Pair of sequences are not supported. The signature has been kept for compatibility but all arguments related to pair of sequences are ignored. The return values for pairs are returned as `None`.
168
+ It provides a Hugging Face compatible interface to tokenize using the official mistral-common tokenizer.
169
+
170
+ Supports the following methods from the `PreTrainedTokenizerBase` class:
171
+
172
+ - [`~MistralCommonBackend.get_vocab`]: Returns the vocabulary as a dictionary of token to index.
173
+ This is a lossy conversion for the Tekkenizer, as some decoding errors are collapsed into the same token.
174
+ - [`~MistralCommonBackend.encode`]: Encode a string to a list of integers.
175
+ - [`~MistralCommonBackend.decode`]: Decode a list of integers to a string.
176
+ - [`~MistralCommonBackend.batch_decode`]: Decode a batch of list of integers to a list of strings.
177
+ - [`~MistralCommonBackend.convert_tokens_to_ids`]: Convert a list of tokens to a list of integers.
178
+ - [`~MistralCommonBackend.convert_ids_to_tokens`]: Convert a list of integers to a list of tokens.
179
+ - [`~MistralCommonBackend.tokenize`]: Tokenize a string.
180
+ - [`~MistralCommonBackend.get_special_tokens_mask`]: Get the special tokens mask for a list of tokens.
181
+ - [`~MistralCommonBackend.prepare_for_model`]: Prepare a list of inputs for the model.
182
+ - [`~MistralCommonBackend.pad`]: Pad a list of inputs to the same length.
183
+ - [`~MistralCommonBackend.truncate_sequences`]: Truncate a sequence of ids to a given length.
184
+ - [`~MistralCommonBackend.apply_chat_template`]: Apply a chat template to a list of messages.
185
+ - [`~MistralCommonBackend.__call__`]: Tokenize a string or a list of strings.
186
+ - [`~MistralCommonBackend.from_pretrained`]: Download and cache a pretrained tokenizer from the Hugging Face model hub or local directory.
187
+ - [`~MistralCommonBackend.save_pretrained`]: Save a tokenizer to a directory, so it can be reloaded using the `from_pretrained` class method.
188
+ - [`~MistralCommonBackend.push_to_hub`]: Upload tokenizer to the Hugging Face model hub.
189
+
190
+ Here are the key differences with the `PreTrainedTokenizerBase` class:
191
+
192
+ - Pairs of sequences are not supported. The signature has been kept for compatibility, but all arguments related to sequence pairs are ignored. The return values for pairs are returned as `None`.
206
193
  - The `is_split_into_words` argument is not supported.
207
- - It is not possible to add new tokens to the tokenizer. Special tokens are handled differently from Transformers. In `mistral-common`, special tokens are never encoded directly. This means that: `tokenizer.encode("<s>")` will not return the ID of the `<s>` token. Instead, it will return a list of IDs corresponding to the tokenization of the string `"<s>"`. For more information, see the [mistral-common documentation](https://mistralai.github.io/mistral-common/usage/tokenizers/#special-tokens).
194
+ - The `return_token_type_ids` argument is not supported.
195
+ - It is not possible to add new tokens to the tokenizer. Also, special tokens are handled differently from Transformers. In `mistral-common`, special tokens are never encoded directly. This means that `tokenizer.encode("<s>")` will not return the ID of the `<s>` token. Instead, it will return a list of IDs corresponding to the tokenization of the string `"<s>"`. For more information, see the [mistral-common documentation](https://mistralai.github.io/mistral-common/usage/tokenizers/#special-tokens).
208
196
 
209
197
  If you have suggestions to improve this class, please open an issue on the [mistral-common GitHub repository](https://github.com/mistralai/mistral-common/issues) if it is related to the tokenizer or on the [Transformers GitHub repository](https://github.com/huggingface/transformers/issues) if it is related to the Hugging Face interface.
210
198
  """
@@ -212,12 +200,6 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
212
200
  model_input_names: list[str] = ["input_ids", "attention_mask"]
213
201
  padding_side: str = "left"
214
202
  truncation_side: str = "right"
215
- SPECIAL_TOKENS_ATTRIBUTES = [
216
- "bos_token",
217
- "eos_token",
218
- "unk_token",
219
- "pad_token",
220
- ]
221
203
 
222
204
  def __init__(
223
205
  self,
@@ -244,7 +226,7 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
244
226
  Path to the tokenizer file to load the `MistralTokenizer`.
245
227
  mode (`Union[str, ValidationMode]`, *optional*, defaults to `ValidationMode.test`):
246
228
  The mode to use for the tokenizer. This will be passed to the `MistralTokenizer` constructor. Possible values are:
247
- - `"finetuning"` or `ValidationMode.finetuning`: The fine-tuning mode.
229
+ - `"finetuning"` or `ValidationMode.finetuning`: The finetuning mode.
248
230
  - `"test"` or `ValidationMode.test`: The test mode.
249
231
  It changes how the tokenizer validates the input and prepares the request to the model.
250
232
  model_max_length (`int`, *optional*):
@@ -258,49 +240,60 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
258
240
  truncation_side (`str`, *optional*):
259
241
  The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
260
242
  Default value is picked from the class attribute of the same name.
261
- model_input_names (`List[str]`, *optional*):
243
+ model_input_names (`List[str]`, *optional*):
262
244
  The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
263
245
  `"attention_mask"`). Default value is picked from the class attribute of the same name.
264
246
  clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
265
- Whether or not the model should clean up the spaces that were added when splitting the input text during the
247
+ Whether or not the model should clean up the spaces that were added when splitting the input text during the
266
248
  tokenization process.
267
249
  """
268
- if kwargs and not set(kwargs.keys()).issubset(_VALID_INIT_KWARGS):
250
+ if kwargs:
269
251
  raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported to init `MistralCommonBackend`.")
270
252
 
271
- self.init_kwargs = {
272
- "tokenizer_path": tokenizer_path,
273
- "mode": mode,
274
- "model_max_length": model_max_length,
275
- "padding_side": padding_side,
276
- "truncation_side": truncation_side,
277
- "model_input_names": model_input_names,
278
- "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
279
- }
280
253
  self._tokenizer_path = Path(tokenizer_path)
281
254
  self._mode = self._get_validation_mode(mode)
282
-
283
255
  self.tokenizer: MistralTokenizer = MistralTokenizer.from_file(str(self._tokenizer_path), mode=self._mode)
284
256
  self._tokenizer_type = (
285
257
  MistralTokenizerType.tekken
286
258
  if isinstance(self.tokenizer.instruct_tokenizer.tokenizer, Tekkenizer)
287
259
  else MistralTokenizerType.spm
288
260
  )
289
- self._cache_get_vocab: dict[str, int] | None = None
261
+ self.truncation_side = truncation_side
262
+ self.padding_side = padding_side
263
+ self.model_max_length = model_max_length
264
+ self.cleanup_tokenization_spaces = clean_up_tokenization_spaces
265
+ self.deprecation_warnings = {} # Used to record deprecation warnings already emitted (avoids over-logging).
266
+ self._all_special_tokens_ids = self._get_all_special_ids()
267
+
268
+ if model_input_names is not None:
269
+ if (
270
+ not isinstance(model_input_names, (list, tuple))
271
+ or len(model_input_names) == 0
272
+ or not all(isinstance(i, str) for i in model_input_names)
273
+ ):
274
+ raise ValueError(
275
+ "`model_input_names` should be a non-empty list or tuple of str but got an empty value."
276
+ )
277
+ self.model_input_names = model_input_names
290
278
 
291
- self._all_special_ids = self._get_all_special_ids()
292
- self._all_special_tokens = self.convert_ids_to_tokens(self.all_special_ids)
279
+ self._cache_get_vocab: dict[str, int] | None = None
293
280
 
294
- super().__init__(
295
- truncation_side=truncation_side,
296
- padding_side=padding_side,
297
- model_max_length=model_max_length,
298
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
299
- extra_special_tokens=None, # Not used by this backend.
300
- model_specific_special_tokens=None, # Not used by this backend.
301
- model_input_names=model_input_names or self.model_input_names,
302
- **_MAP_SPECIAL_TOKENS,
303
- **kwargs,
281
+ @staticmethod
282
+ def clean_up_tokenization(text: str) -> str:
283
+ """
284
+ Clean up simple English tokenization artifacts such as spaces before punctuation.
285
+ """
286
+ return (
287
+ text.replace(" .", ".")
288
+ .replace(" ?", "?")
289
+ .replace(" !", "!")
290
+ .replace(" ,", ",")
291
+ .replace(" ' ", "'")
292
+ .replace(" n't", "n't")
293
+ .replace(" 'm", "'m")
294
+ .replace(" 's", "'s")
295
+ .replace(" 've", "'ve")
296
+ .replace(" 're", "'re")
304
297
  )
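
Since `clean_up_tokenization` is a static method doing pure string manipulation, it can be exercised without a tokenizer file; a quick check of the rules above:

    text = "Hello , world ! I 'm here ; do n't worry ."
    print(MistralCommonBackend.clean_up_tokenization(text))
    # -> "Hello, world! I'm here ; don't worry."  (only the listed artifacts are collapsed)
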
305
298
 
306
299
  @property
@@ -313,19 +306,75 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
313
306
  """
314
307
  return self._mode
315
308
 
309
+ @property
310
+ def bos_token_id(self) -> int:
311
+ """
312
+ Id of the beginning of sentence token in the vocabulary.
313
+ """
314
+ return self.tokenizer.instruct_tokenizer.tokenizer.bos_id
315
+
316
+ @property
317
+ def eos_token_id(self) -> int:
318
+ """
319
+ Id of the end of sentence token in the vocabulary.
320
+ """
321
+ return self.tokenizer.instruct_tokenizer.tokenizer.eos_id
322
+
323
+ @property
324
+ def unk_token_id(self) -> int:
325
+ """
326
+ Id of the unknown token in the vocabulary.
327
+ """
328
+ return self.tokenizer.instruct_tokenizer.tokenizer.unk_id
329
+
330
+ @property
331
+ def pad_token_id(self) -> int:
332
+ """
333
+ Id of the padding token in the vocabulary.
334
+ """
335
+ return self.tokenizer.instruct_tokenizer.tokenizer.pad_id
336
+
337
+ @property
338
+ def bos_token(self) -> str:
339
+ """
340
+ String associated to the beginning of sentence token in the vocabulary.
341
+ """
342
+ return self.convert_ids_to_tokens(self.bos_token_id)
343
+
344
+ @property
345
+ def eos_token(self) -> str:
346
+ """
347
+ String associated to the end of sentence token in the vocabulary.
348
+ """
349
+ return self.convert_ids_to_tokens(self.eos_token_id)
350
+
351
+ @property
352
+ def unk_token(self) -> str:
353
+ """
354
+ String associated to the unknown token in the vocabulary.
355
+ """
356
+ return self.convert_ids_to_tokens(self.unk_token_id)
357
+
358
+ @property
359
+ def pad_token(self) -> str:
360
+ """
361
+ String associated to the padding token in the vocabulary.
362
+ """
363
+ return self.convert_ids_to_tokens(self.pad_token_id)
364
+
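
A sketch of how these properties tie together (`tok` as above; the printed values are assumptions and depend on the loaded tokenizer):

    print(tok.bos_token_id, tok.bos_token)  # e.g. 1 "<s>"
    print(tok.eos_token_id, tok.eos_token)  # e.g. 2 "</s>"
    # Each *_token property is derived from its *_token_id via convert_ids_to_tokens.
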
316
365
  @property
317
366
  def all_special_ids(self) -> list[int]:
318
367
  """
319
368
  `list[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.).
320
369
  """
321
- return sorted(self._all_special_ids)
370
+ return sorted(self._all_special_tokens_ids)
322
371
 
323
372
  @property
324
373
  def all_special_tokens(self) -> list[str]:
325
374
  """
326
375
  `list[str]`: A list of all unique special tokens.
327
376
  """
328
- return self._all_special_tokens
377
+ return self.convert_ids_to_tokens(self.all_special_ids)
329
378
 
330
379
  @property
331
380
  def vocab_size(self) -> int:
@@ -386,8 +435,6 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
386
435
  padding_side: str | None = None,
387
436
  return_tensors: str | TensorType | None = None,
388
437
  verbose: bool = True,
389
- return_offsets_mapping: Literal[False] = False,
390
- split_special_tokens: Literal[False] = False,
391
438
  **kwargs,
392
439
  ) -> list[int]:
393
440
  """
@@ -399,81 +446,37 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
399
446
  text_pair (`None`, *optional*):
400
447
  Not supported by `MistralCommonBackend.encode`. Kept to match `PreTrainedTokenizerBase.encode` signature.
401
448
  """
402
- if return_offsets_mapping or split_special_tokens:
403
- raise ValueError(
404
- "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
405
- )
406
-
407
- if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND, "only_first", "only_second"]:
408
- raise ValueError(
409
- "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonBackend`."
410
- )
411
-
412
449
  if kwargs:
413
450
  raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.encode`.")
414
-
415
451
  if text_pair:
416
452
  raise ValueError("`MistralCommonBackend.encode` does not support `text_pair`.")
417
453
 
418
- return super().encode(
419
- text=text,
420
- text_pair=text_pair,
421
- add_special_tokens=add_special_tokens,
454
+ padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
422
455
  padding=padding,
423
456
  truncation=truncation,
424
457
  max_length=max_length,
425
- stride=stride,
426
- return_tensors=return_tensors,
427
458
  pad_to_multiple_of=pad_to_multiple_of,
428
- padding_side=padding_side,
429
459
  verbose=verbose,
430
460
  )
431
461
 
432
- def _decode(
433
- self,
434
- token_ids: int | list[int],
435
- skip_special_tokens: bool = False,
436
- clean_up_tokenization_spaces: bool | None = None,
437
- **kwargs,
438
- ) -> str:
439
- if kwargs:
440
- raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.decode`.")
441
-
442
- token_ids = to_py_obj(token_ids)
443
-
444
- if isinstance(token_ids, int):
445
- token_ids = [token_ids]
446
-
447
- special_token_policy = SpecialTokenPolicy.IGNORE if skip_special_tokens else SpecialTokenPolicy.KEEP
448
-
449
- text = self.tokenizer.decode(token_ids, special_token_policy=special_token_policy)
450
-
451
- # Apply tokenizer-specific cleanup if available and requested
452
- clean_up_tokenization_spaces = (
453
- clean_up_tokenization_spaces
454
- if clean_up_tokenization_spaces is not None
455
- else self.clean_up_tokenization_spaces
462
+ encoded_inputs = self._encode_plus(
463
+ text,
464
+ add_special_tokens=add_special_tokens,
465
+ padding_strategy=padding_strategy,
466
+ truncation_strategy=truncation_strategy,
467
+ max_length=max_length,
468
+ stride=stride,
469
+ pad_to_multiple_of=pad_to_multiple_of,
470
+ padding_side=padding_side,
471
+ return_tensors=return_tensors,
472
+ return_attention_mask=False,
473
+ return_overflowing_tokens=False,
474
+ return_special_tokens_mask=False,
475
+ return_length=False,
476
+ verbose=verbose,
456
477
  )
457
- if clean_up_tokenization_spaces:
458
- # Call custom cleanup method if it exists (e.g., for CLVP's [SPACE] token replacement)
459
- if hasattr(self, "clean_up_tokenization") and callable(self.clean_up_tokenization):
460
- text = self.clean_up_tokenization(text)
461
- else:
462
- # Otherwise apply standard cleanup
463
- text = (
464
- text.replace(" .", ".")
465
- .replace(" ?", "?")
466
- .replace(" !", "!")
467
- .replace(" ,", ",")
468
- .replace(" ' ", "'")
469
- .replace(" n't", "n't")
470
- .replace(" 'm", "'m")
471
- .replace(" 's", "'s")
472
- .replace(" 've", "'ve")
473
- .replace(" 're", "'re")
474
- )
475
478
 
476
- return _maybe_remove_lang(text=text, skip_special_tokens=skip_special_tokens)
479
+ return encoded_inputs["input_ids"]
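
A minimal sketch of the rewritten `encode` path, which now routes through `_encode_plus` and returns only the input ids:

    ids = tok.encode("Hello world")
    ids_trunc = tok.encode("Hello world", truncation=True, max_length=4)
    assert len(ids_trunc) <= 4  # truncated via truncate_sequences before returning
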
477
480
 
478
481
  def decode(
479
482
  self,
@@ -481,7 +484,7 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
481
484
  skip_special_tokens: bool = False,
482
485
  clean_up_tokenization_spaces: bool | None = None,
483
486
  **kwargs,
484
- ) -> str | list[str]:
487
+ ) -> str | list[str]:
485
488
  """
486
489
  Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
487
490
  tokens and clean up tokenization spaces.
@@ -506,7 +509,16 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
506
509
  if kwargs:
507
510
  raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.decode`.")
508
511
 
509
- return super().decode(
512
+ token_ids = to_py_obj(token_ids)
513
+
514
+ if isinstance(token_ids, (list, tuple)) and len(token_ids) > 0 and isinstance(token_ids[0], (list, tuple)):
515
+ return self._batch_decode(
516
+ sequences=token_ids,
517
+ skip_special_tokens=skip_special_tokens,
518
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
519
+ )
520
+
521
+ return self._decode(
510
522
  token_ids=token_ids,
511
523
  skip_special_tokens=skip_special_tokens,
512
524
  clean_up_tokenization_spaces=clean_up_tokenization_spaces,
@@ -543,12 +555,63 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
543
555
  if kwargs:
544
556
  raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.batch_decode`.")
545
557
 
546
- return super().batch_decode(
558
+ return self._batch_decode(
547
559
  sequences=sequences,
548
560
  skip_special_tokens=skip_special_tokens,
549
561
  clean_up_tokenization_spaces=clean_up_tokenization_spaces,
550
562
  )
551
563
 
564
+ def _decode(
565
+ self,
566
+ token_ids: Union[int, list[int], list[list[int]], np.ndarray, "torch.Tensor"],
567
+ skip_special_tokens: bool = False,
568
+ clean_up_tokenization_spaces: bool | None = None,
569
+ ) -> str:
570
+ clean_up_tokenization_spaces = self.cleanup_tokenization_spaces if clean_up_tokenization_spaces is None else clean_up_tokenization_spaces
571
+
572
+ # Convert inputs to python lists
573
+ if isinstance(token_ids, int):
574
+ token_ids = [token_ids]
575
+
576
+ token_ids = to_py_obj(token_ids)
577
+
578
+ special_token_policy = SpecialTokenPolicy.IGNORE if skip_special_tokens else SpecialTokenPolicy.KEEP
579
+
580
+ decoded_string = self.tokenizer.decode(token_ids, special_token_policy=special_token_policy)
581
+ if clean_up_tokenization_spaces:
582
+ decoded_string = self.clean_up_tokenization(decoded_string)
583
+
584
+ # In the specific case of Voxtral, the prepended "lang:xx" (always a two-character language code, following the ISO 639-1 alpha-2 format)
585
+ # is not considered a special token by mistral-common and is encoded/decoded as normal text.
586
+ # Nevertheless, we remove it here to make the output easier to consume.
587
+ if skip_special_tokens:
588
+ decoded_string = re.sub(r"^lang:[a-z]{2}", "", decoded_string)
589
+
590
+ return decoded_string
591
+
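
The `lang:xx` stripping above is a plain regex applied to the decoded text, only when `skip_special_tokens=True`; a self-contained sketch of that step:

    import re
    print(re.sub(r"^lang:[a-z]{2}", "", "lang:enHello"))  # -> "Hello"
    # Anchored at the start of the string, so an inline "lang:fr" is left untouched.
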
592
+ def _batch_decode(
593
+ self,
594
+ sequences: Union[list[int], list[list[int]], np.ndarray, "torch.Tensor"],
595
+ skip_special_tokens: bool = False,
596
+ clean_up_tokenization_spaces: bool | None = None,
597
+ ) -> list[str]:
598
+ return [
599
+ self._decode(
600
+ seq,
601
+ skip_special_tokens=skip_special_tokens,
602
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
603
+ )
604
+ for seq in sequences
605
+ ]
606
+
607
+ def _is_control_token(self, token_id: int) -> bool:
608
+ if self._tokenizer_type == MistralTokenizerType.spm:
609
+ return token_id in self.tokenizer.instruct_tokenizer.tokenizer._control_tokens
610
+ elif self._tokenizer_type == MistralTokenizerType.tekken:
611
+ return token_id < self.tokenizer.instruct_tokenizer.tokenizer.num_special_tokens
612
+ else:
613
+ raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
614
+
552
615
  @overload
553
616
  def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ...
554
617
  @overload
@@ -569,22 +632,22 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
569
632
  """
570
633
 
571
634
  if isinstance(ids, int):
572
- return_int = True
635
+ one_token = True
573
636
  ids = [ids]
574
637
  else:
575
- return_int = False
638
+ one_token = False
576
639
 
577
640
  tokens: list[str] = []
578
641
  for token_id in ids:
579
- if self.tokenizer.instruct_tokenizer.tokenizer.is_special(token_id) and skip_special_tokens:
642
+ if self._is_control_token(token_id) and skip_special_tokens:
580
643
  continue
581
644
  tokens.append(self.tokenizer.instruct_tokenizer.tokenizer.id_to_piece(token_id))
582
645
 
583
- if return_int and tokens == []:
584
- raise ValueError(f"Invalid token id {ids[0]}.")
585
- elif return_int:
586
- return tokens[0]
646
+ if one_token:
647
+ if tokens == []:
648
+ raise ValueError(f"Invalid token id {ids}.")
587
649
 
650
+ return tokens[0]
588
651
  return tokens
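
A sketch of the overloads above (the ids are hypothetical):

    piece = tok.convert_ids_to_tokens(1)        # single int -> single str, e.g. "<s>"
    pieces = tok.convert_ids_to_tokens([1, 5])  # list -> list of str
    # With skip_special_tokens=True, control tokens are dropped, so the output
    # list may be shorter than the input list.
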
589
652
 
590
653
  def _tekken_piece_to_id(self, piece: str, warn: bool) -> int:
@@ -645,13 +708,7 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
645
708
  tokens_ids = self.tokenizer.instruct_tokenizer.tokenizer.encode(text, bos=add_special_tokens, eos=add_eos)
646
709
  return tokens_ids
647
710
 
648
- def tokenize(
649
- self,
650
- text: TextInput,
651
- return_offsets_mapping: Literal[False] = False,
652
- split_special_tokens: Literal[False] = False,
653
- **kwargs,
654
- ) -> list[str]:
711
+ def tokenize(self, text: TextInput, **kwargs) -> list[str]:
655
712
  """
656
713
  Converts a string into a sequence of tokens, using the tokenizer.
657
714
 
@@ -660,8 +717,6 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
660
717
  Args:
661
718
  text (`str`):
662
719
  The sequence to be encoded.
663
- return_offsets_mapping (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
664
- split_special_tokens (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
665
720
  **kwargs (additional keyword arguments):
666
721
  Not supported by `MistralCommonBackend.tokenize`.
667
722
  Will raise an error if used.
@@ -669,164 +724,40 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
669
724
  Returns:
670
725
  `list[str]`: The list of tokens.
671
726
  """
672
- if return_offsets_mapping or split_special_tokens:
673
- raise ValueError(
674
- "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
675
- )
676
-
677
727
  if kwargs:
678
728
  raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.tokenize`.")
679
729
 
680
730
  return self.convert_ids_to_tokens(self._text_to_ids(text, add_special_tokens=False), skip_special_tokens=False)
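
A sketch of `tokenize` (the exact pieces depend on the underlying spm or tekken vocabulary):

    pieces = tok.tokenize("Hello world")
    # e.g. ["▁Hello", "▁world"] for a SentencePiece model; no BOS/EOS pieces appear,
    # since the ids are produced with add_special_tokens=False.
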
681
731
 
682
- def _get_all_special_ids(self) -> set[int]:
683
- if self._tokenizer_type == MistralTokenizerType.tekken:
684
- return self.tokenizer.instruct_tokenizer.tokenizer._special_token_ids
685
- elif self._tokenizer_type == MistralTokenizerType.spm:
686
- return {
687
- token_id
688
- for token_id in range(self.tokenizer.instruct_tokenizer.tokenizer.n_words)
689
- if self.tokenizer.instruct_tokenizer.tokenizer.is_special(token_id)
690
- }
691
- else:
692
- raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
693
-
694
- def get_special_tokens_mask(
695
- self, token_ids_0: list[int], token_ids_1: None = None, already_has_special_tokens: bool = False
696
- ) -> list[int]:
697
- """
698
- Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
699
- special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
700
-
701
- Args:
702
- token_ids_0 (`list[int]`): List of ids of the sequence.
703
- token_ids_1 (`None`, *optional*): None, kept to match Transformers' implementation.
704
- already_has_special_tokens (`bool`, *optional*, defaults to `False`):
705
- Whether or not the token list is already formatted with special tokens for the model.
706
-
707
- Returns:
708
- A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
709
- """
710
- if token_ids_1 is not None:
711
- raise ValueError(
712
- "`token_ids_1` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
713
- )
714
-
715
- if already_has_special_tokens:
716
- return [1 if int(token_id) in self._all_special_ids else 0 for token_id in token_ids_0]
717
-
718
- if self.mode == ValidationMode.test:
719
- # [BOS] seq0
720
- return [1] + ([0] * len(token_ids_0))
721
- else:
722
- # [BOS] seq0 [EOS]
723
- return [1] + ([0] * len(token_ids_0)) + [1]
724
-
725
- def _encode_plus( # type: ignore[override]
732
+ def _encode_plus(
726
733
  self,
727
- text: TextInput | PreTokenizedInput | EncodedInput,
728
- text_pair: None = None,
734
+ text: TextInput | EncodedInput,
729
735
  add_special_tokens: bool = True,
730
736
  padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
731
737
  truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
732
738
  max_length: int | None = None,
733
739
  stride: int = 0,
734
- is_split_into_words: bool = False,
735
740
  pad_to_multiple_of: int | None = None,
736
741
  padding_side: str | None = None,
737
742
  return_tensors: str | TensorType | None = None,
738
- return_token_type_ids: bool | None = None,
739
743
  return_attention_mask: bool | None = None,
740
744
  return_overflowing_tokens: bool = False,
741
745
  return_special_tokens_mask: bool = False,
742
746
  return_length: bool = False,
743
747
  verbose: bool = True,
744
- return_offsets_mapping: Literal[False] = False,
745
- split_special_tokens: Literal[False] = False,
746
- **kwargs,
747
748
  ) -> BatchEncoding:
748
- # Detect batched inputs (list of sequences)
749
- if text_pair is not None:
750
- raise ValueError("`MistralCommonBackend` does not support `text_pair != None` for `_encode_plus`.")
751
-
752
- if return_offsets_mapping or split_special_tokens:
753
- raise ValueError(
754
- "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
755
- )
756
-
757
- if kwargs:
758
- raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend._encode_plus`.")
759
-
760
- is_batched = isinstance(text, (list, tuple)) and (
761
- (not text and not is_split_into_words)
762
- or (text and is_split_into_words and isinstance(text[0], (list, tuple)))
763
- or (text and not is_split_into_words and isinstance(text[0], (str, list, tuple)))
764
- )
765
-
766
- if is_batched:
767
- batch_outputs = {}
768
- one_overflowed = False
769
- for current_text in text:
770
- current_output = self._encode_plus(
771
- text=current_text,
772
- text_pair=None,
773
- add_special_tokens=add_special_tokens,
774
- padding_strategy=PaddingStrategy.DO_NOT_PAD, # we pad in batch afterward
775
- truncation_strategy=truncation_strategy,
776
- max_length=max_length,
777
- stride=stride,
778
- is_split_into_words=is_split_into_words,
779
- pad_to_multiple_of=None, # we pad in batch afterward
780
- padding_side=None, # we pad in batch afterward
781
- return_tensors=None, # We convert the whole batch to tensors at the end
782
- return_token_type_ids=return_token_type_ids,
783
- return_attention_mask=False, # we pad in batch afterward
784
- return_overflowing_tokens=return_overflowing_tokens,
785
- return_special_tokens_mask=return_special_tokens_mask,
786
- return_length=return_length,
787
- verbose=verbose,
788
- )
789
- for key, value in current_output.items():
790
- batch_outputs.setdefault(key, []).append(value)
791
-
792
- # To ensure the list is built for each sample, we need to add this.
793
- if return_overflowing_tokens and not return_tensors:
794
- if "overflowing_tokens" not in current_output:
795
- batch_outputs.setdefault("overflowing_tokens", []).append([0])
796
- batch_outputs.setdefault("num_truncated_tokens", []).append([0])
797
- else:
798
- one_overflowed = True
799
-
800
- # Remove overflow-related keys before tensor conversion if return_tensors is set
801
- # Slow tokenizers don't support returning these as tensors
802
- if return_overflowing_tokens and (return_tensors or not one_overflowed):
803
- batch_outputs.pop("overflowing_tokens", None)
804
- batch_outputs.pop("num_truncated_tokens", None)
805
-
806
- batch_outputs = self.pad(
807
- batch_outputs,
808
- padding=padding_strategy.value,
809
- max_length=max_length,
810
- pad_to_multiple_of=pad_to_multiple_of,
811
- padding_side=padding_side,
812
- return_attention_mask=return_attention_mask,
813
- )
814
-
815
- return BatchEncoding(batch_outputs, tensor_type=return_tensors)
816
-
817
749
  def get_input_ids(text):
818
750
  if isinstance(text, str):
819
- return self._text_to_ids(text, False)
751
+ return self._text_to_ids(text, add_special_tokens)
820
752
  elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
821
753
  return text
822
754
  else:
823
755
  raise ValueError(f"Input {text} is not valid. Should be a string, or a list/tuple of integers.")
824
756
 
825
- first_ids = get_input_ids(text)
757
+ ids = get_input_ids(text)
826
758
 
827
759
  return self.prepare_for_model(
828
- first_ids,
829
- pair_ids=None,
760
+ ids,
830
761
  add_special_tokens=add_special_tokens,
831
762
  padding=padding_strategy.value,
832
763
  truncation=truncation_strategy.value,
@@ -837,128 +768,242 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
837
768
  return_tensors=return_tensors,
838
769
  prepend_batch_axis=True,
839
770
  return_attention_mask=return_attention_mask,
840
- return_token_type_ids=return_token_type_ids,
841
771
  return_overflowing_tokens=return_overflowing_tokens,
842
772
  return_special_tokens_mask=return_special_tokens_mask,
843
773
  return_length=return_length,
844
774
  verbose=verbose,
845
775
  )
846
776
 
847
- @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
848
- def prepare_for_model(
777
+ def _batch_encode_plus(
849
778
  self,
850
- ids: list[int],
851
- pair_ids: None = None,
779
+ batch_text: list[TextInput] | list[EncodedInput],
852
780
  add_special_tokens: bool = True,
853
- padding: bool | str | PaddingStrategy = False,
854
- truncation: bool | str | TruncationStrategy | None = None,
781
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
782
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
855
783
  max_length: int | None = None,
856
784
  stride: int = 0,
857
785
  pad_to_multiple_of: int | None = None,
858
786
  padding_side: str | None = None,
859
787
  return_tensors: str | TensorType | None = None,
860
- return_token_type_ids: bool | None = None,
861
788
  return_attention_mask: bool | None = None,
862
789
  return_overflowing_tokens: bool = False,
863
790
  return_special_tokens_mask: bool = False,
864
791
  return_length: bool = False,
865
792
  verbose: bool = True,
866
- prepend_batch_axis: bool = False,
867
- return_offsets_mapping: Literal[False] = False,
868
- split_special_tokens: Literal[False] = False,
869
- **kwargs,
870
793
  ) -> BatchEncoding:
794
+ def get_input_ids(text):
795
+ if isinstance(text, str):
796
+ return self._text_to_ids(text, add_special_tokens)
797
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
798
+ return text
799
+ else:
800
+ raise ValueError("Input is not valid. Should be a string or a list/tuple of integers.")
801
+
802
+ input_ids = []
803
+ for ids in batch_text:
804
+ input_ids.append(get_input_ids(ids))
805
+
806
+ batch_outputs = self._batch_prepare_for_model(
807
+ input_ids,
808
+ add_special_tokens=add_special_tokens,
809
+ padding_strategy=padding_strategy,
810
+ truncation_strategy=truncation_strategy,
811
+ max_length=max_length,
812
+ stride=stride,
813
+ pad_to_multiple_of=pad_to_multiple_of,
814
+ padding_side=padding_side,
815
+ return_attention_mask=return_attention_mask,
816
+ return_overflowing_tokens=return_overflowing_tokens,
817
+ return_special_tokens_mask=return_special_tokens_mask,
818
+ return_length=return_length,
819
+ return_tensors=return_tensors,
820
+ verbose=verbose,
821
+ )
822
+
823
+ return BatchEncoding(batch_outputs)
824
+
825
+ def _get_all_special_ids(self) -> set[int]:
826
+ if self._tokenizer_type == MistralTokenizerType.tekken:
827
+ return {t["rank"] for t in self.tokenizer.instruct_tokenizer.tokenizer._all_special_tokens}
828
+ elif self._tokenizer_type == MistralTokenizerType.spm:
829
+ return self.tokenizer.instruct_tokenizer.tokenizer._control_tokens
830
+ else:
831
+ raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
832
+
833
+ def get_special_tokens_mask(
834
+ self, token_ids_0: list, token_ids_1: None = None, already_has_special_tokens: bool = False
835
+ ) -> list[int]:
871
836
  """
872
- Prepares a sequence of input id so that it can be used by the model. It
873
- adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
874
- manages a moving window (with user defined stride) for overflowing tokens.
837
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
838
+ special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
875
839
 
876
840
  Args:
877
- ids (`list[int]`):
878
- Tokenized input ids of the first sequence.
879
- pair_ids (`None`, *optional*):
841
+ token_ids_0 (`list[int]`):
842
+ List of ids of the sequence.
843
+ token_ids_1 (`list[int]`, *optional*):
880
844
  Not supported by `MistralCommonBackend`. Kept to match the interface of `PreTrainedTokenizerBase`.
845
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
846
+ Whether or not the token list is already formatted with special tokens for the model.
847
+
848
+ Returns:
849
+ A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
881
850
  """
882
- if return_offsets_mapping or split_special_tokens:
851
+ if token_ids_1 is not None:
883
852
  raise ValueError(
884
- "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
853
+ "`token_ids_1` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
885
854
  )
886
-
887
- if pair_ids is not None:
855
+ if already_has_special_tokens:
888
856
  raise ValueError(
889
- "`pair_ids` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
857
+ "`already_has_special_tokens` is not supported by `MistralCommonBackend` and should be `False`."
890
858
  )
891
859
 
892
- if kwargs:
893
- raise ValueError(
894
- f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.prepare_for_model`."
895
- )
860
+ special_tokens_mask = [1 if token in self._all_special_tokens_ids else 0 for token in token_ids_0]
861
+ return special_tokens_mask
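
Given the implementation above, the mask is a pure membership test against the cached special-token ids; a sketch with hypothetical ids:

    # Suppose tok._all_special_tokens_ids == {1, 2} (assumption for illustration).
    mask = tok.get_special_tokens_mask([1, 17, 42, 2])
    # -> [1, 0, 0, 1]
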
896
862
 
897
- padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
863
+ def _batch_prepare_for_model(
864
+ self,
865
+ batch_ids: list[PreTokenizedInput | list[int]],
866
+ add_special_tokens: bool = True,
867
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
868
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
869
+ max_length: int | None = None,
870
+ stride: int = 0,
871
+ pad_to_multiple_of: int | None = None,
872
+ padding_side: str | None = None,
873
+ return_tensors: str | None = None,
874
+ return_attention_mask: bool | None = None,
875
+ return_overflowing_tokens: bool = False,
876
+ return_special_tokens_mask: bool = False,
877
+ return_length: bool = False,
878
+ verbose: bool = True,
879
+ ) -> BatchEncoding:
880
+ """
881
+ Prepares a batch of input ids so that they can be used by the model. It
882
+ adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
883
+ manages a moving window (with user defined stride) for overflowing tokens.
884
+
885
+ Args:
886
+ batch_ids: list of tokenized input ids
887
+ """
888
+
889
+ batch_outputs = {}
890
+ for ids in batch_ids:
891
+ outputs = self.prepare_for_model(
892
+ ids,
893
+ add_special_tokens=add_special_tokens,
894
+ padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward
895
+ truncation=truncation_strategy.value,
896
+ max_length=max_length,
897
+ stride=stride,
898
+ pad_to_multiple_of=None, # we pad in batch afterward
899
+ padding_side=None, # we pad in batch afterward
900
+ return_attention_mask=False, # we pad in batch afterward
901
+ return_overflowing_tokens=return_overflowing_tokens,
902
+ return_special_tokens_mask=return_special_tokens_mask,
903
+ return_length=return_length,
904
+ return_tensors=None, # We convert the whole batch to tensors at the end
905
+ prepend_batch_axis=False,
906
+ verbose=verbose,
907
+ )
908
+
909
+ for key, value in outputs.items():
910
+ if key not in batch_outputs:
911
+ batch_outputs[key] = []
912
+ batch_outputs[key].append(value)
913
+
914
+ batch_outputs = self.pad(
915
+ batch_outputs,
916
+ padding=padding_strategy.value,
917
+ max_length=max_length,
918
+ pad_to_multiple_of=pad_to_multiple_of,
919
+ padding_side=padding_side,
920
+ return_attention_mask=return_attention_mask,
921
+ )
922
+
923
+ batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
924
+
925
+ return batch_outputs
926
+
927
+ @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
928
+ def prepare_for_model(
929
+ self,
930
+ ids: list[int],
931
+ pair_ids: None = None,
932
+ add_special_tokens: bool = True,
933
+ padding: bool | str | PaddingStrategy = False,
934
+ truncation: bool | str | TruncationStrategy | None = None,
935
+ max_length: int | None = None,
936
+ stride: int = 0,
937
+ pad_to_multiple_of: int | None = None,
938
+ padding_side: str | None = None,
939
+ return_tensors: str | TensorType | None = None,
940
+ return_attention_mask: bool | None = None,
941
+ return_overflowing_tokens: bool = False,
942
+ return_special_tokens_mask: bool = False,
943
+ return_length: bool = False,
944
+ verbose: bool = True,
945
+ prepend_batch_axis: bool = False,
946
+ **kwargs,
947
+ ) -> BatchEncoding:
948
+ """
949
+ Prepares a sequence of input ids so that it can be used by the model. It
950
+ adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
951
+ manages a moving window (with user defined stride) for overflowing tokens.
952
+
953
+ Args:
954
+ ids (`list[int]`):
955
+ Tokenized input ids of the first sequence.
956
+ pair_ids (`None`, *optional*):
957
+ Not supported by `MistralCommonBackend`. Kept to match the interface of `PreTrainedTokenizerBase`.
958
+ """
959
+ if pair_ids is not None:
960
+ raise ValueError(
961
+ "`pair_ids` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
962
+ )
963
+ if kwargs:
964
+ raise ValueError(
965
+ f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.prepare_for_model`."
966
+ )
967
+
968
+ padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
898
969
  padding=padding,
899
970
  truncation=truncation,
900
971
  max_length=max_length,
901
972
  pad_to_multiple_of=pad_to_multiple_of,
902
973
  verbose=verbose,
903
- **kwargs,
904
974
  )
905
975
 
906
- # Validation
907
- if (
908
- return_overflowing_tokens
909
- and truncation_strategy == TruncationStrategy.LONGEST_FIRST
910
- and pair_ids is not None
911
- ):
912
- raise ValueError(
913
- "Not possible to return overflowing tokens for pair of sequences with the "
914
- "`longest_first`. Please select another truncation strategy than `longest_first`, "
915
- "for instance `only_second` or `only_first`."
916
- )
976
+ len_ids = len(ids)
917
977
 
918
- # Defaults
919
- if return_token_type_ids is None:
920
- return_token_type_ids = "token_type_ids" in self.model_input_names
978
+ # Load from model defaults
921
979
  if return_attention_mask is None:
922
980
  return_attention_mask = "attention_mask" in self.model_input_names
923
981
 
924
- # Truncation
925
- num_special = self.num_special_tokens_to_add(pair=False) if add_special_tokens else 0
926
- total_len = len(ids) + len(pair_ids or []) + num_special
982
+ encoded_inputs = {}
927
983
 
984
+ # Truncation: Handle max sequence length
928
985
  overflowing_tokens = []
929
- if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
986
+ if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and len_ids > max_length:
930
987
  ids, _, overflowing_tokens = self.truncate_sequences(
931
988
  ids,
932
- pair_ids=None,
933
- num_tokens_to_remove=total_len - max_length,
989
+ num_tokens_to_remove=len_ids - max_length,
934
990
  truncation_strategy=truncation_strategy,
935
991
  stride=stride,
936
992
  )
937
993
 
938
- # Add special tokens
939
- if add_special_tokens:
940
- sequence = self.build_inputs_with_special_tokens(ids, None)
941
- token_type_ids = self.create_token_type_ids_from_sequences(ids, None)
942
- else:
943
- sequence = ids
944
- token_type_ids = [0] * len(sequence)
945
-
946
- # Build output
947
- encoded_inputs = {"input_ids": sequence}
948
- if return_token_type_ids:
949
- encoded_inputs["token_type_ids"] = token_type_ids
950
- if return_special_tokens_mask:
951
- encoded_inputs["special_tokens_mask"] = (
952
- self.get_special_tokens_mask(ids, None) if add_special_tokens else [0] * len(sequence)
953
- )
954
- if return_overflowing_tokens and not return_tensors and overflowing_tokens:
994
+ if return_overflowing_tokens:
955
995
  encoded_inputs["overflowing_tokens"] = overflowing_tokens
956
- encoded_inputs["num_truncated_tokens"] = total_len - max_length if max_length else 0
996
+ encoded_inputs["num_truncated_tokens"] = len_ids - max_length
957
997
 
958
- # Check sequence length and warn if needed
959
- self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
998
+ # Build output dictionary
999
+ encoded_inputs[self.model_input_names[0]] = ids
1000
+ if return_special_tokens_mask:
1001
+ if add_special_tokens:
1002
+ encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, None)
1003
+ else:
1004
+ encoded_inputs["special_tokens_mask"] = [0] * len(ids)
960
1005
 
961
- # Pad
1006
+ # Padding
962
1007
  if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
963
1008
  encoded_inputs = self.pad(
964
1009
  encoded_inputs,
@@ -972,9 +1017,362 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
972
1017
  if return_length:
973
1018
  encoded_inputs["length"] = len(encoded_inputs["input_ids"])
974
1019
 
975
- return BatchEncoding(encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis)
1020
+ batch_outputs = BatchEncoding(
1021
+ encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
1022
+ )
1023
+
1024
+ return batch_outputs
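
A sketch of `prepare_for_model` with truncation and overflow reporting (the ids are arbitrary placeholders):

    out = tok.prepare_for_model(
        list(range(10)),
        truncation=True,
        max_length=6,
        return_overflowing_tokens=True,
    )
    # out["input_ids"] keeps 6 ids, out["num_truncated_tokens"] == 4, and the
    # removed ids (plus any stride window) land in out["overflowing_tokens"].
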
976
1025
 
977
- def truncate_sequences( # type: ignore[override]
1026
+ def _get_padding_truncation_strategies(
1027
+ self,
1028
+ padding: str | PaddingStrategy | bool = False,
1029
+ truncation: str | TruncationStrategy | bool | None = None,
1030
+ max_length: int | None = None,
1031
+ pad_to_multiple_of: int | None = None,
1032
+ verbose: bool = True,
1033
+ **kwargs,
1034
+ ):
1035
+ """
1036
+ Find the correct padding/truncation strategy.
1037
+ """
1038
+
1039
+ # Backward compatibility for previous behavior, maybe we should deprecate it:
1040
+ # If you only set max_length, it activates truncation for max_length
1041
+ if max_length is not None and padding is False and truncation is None:
1042
+ if verbose:
1043
+ if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
1044
+ logger.warning(
1045
+ "Truncation was not explicitly activated but `max_length` is provided a specific value, please"
1046
+ " use `truncation=True` to explicitly truncate examples to max length. Defaulting to"
1047
+ " 'longest_first' truncation strategy."
1048
+ )
1049
+ self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
1050
+ truncation = "longest_first"
1051
+
1052
+ # Get padding strategy
1053
+ if padding is not False:
1054
+ if padding is True:
1055
+ if verbose:
1056
+ if max_length is not None and (
1057
+ truncation is None or truncation is False or truncation == "do_not_truncate"
1058
+ ):
1059
+ warnings.warn(
1060
+ "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
1061
+ "To pad to max length, use `padding='max_length'`."
1062
+ )
1063
+ padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch
1064
+ elif not isinstance(padding, PaddingStrategy):
1065
+ padding_strategy = PaddingStrategy(padding)
1066
+ elif isinstance(padding, PaddingStrategy):
1067
+ padding_strategy = padding
1068
+ else:
1069
+ padding_strategy = PaddingStrategy.DO_NOT_PAD
1070
+
1071
+ # Get truncation strategy
1072
+ if truncation is not False and truncation is not None:
1073
+ if truncation is True:
1074
+ truncation_strategy = (
1075
+ TruncationStrategy.LONGEST_FIRST
1076
+ ) # Default to truncate the longest sequences in pairs of inputs
1077
+ elif not isinstance(truncation, TruncationStrategy):
1078
+ truncation_strategy = TruncationStrategy(truncation)
1079
+ elif isinstance(truncation, TruncationStrategy):
1080
+ truncation_strategy = truncation
1081
+ if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND]:
1082
+ raise ValueError(
1083
+ "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonBackend`."
1084
+ )
1085
+ else:
1086
+ truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
1087
+
1088
+ # Set max length if needed
1089
+ if max_length is None:
1090
+ if padding_strategy == PaddingStrategy.MAX_LENGTH:
1091
+ if self.model_max_length > LARGE_INTEGER:
1092
+ if verbose:
1093
+ if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
1094
+ logger.warning(
1095
+ "Asking to pad to max_length but no maximum length is provided and the model has no"
1096
+ " predefined maximum length. Default to no padding."
1097
+ )
1098
+ self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
1099
+ padding_strategy = PaddingStrategy.DO_NOT_PAD
1100
+ else:
1101
+ max_length = self.model_max_length
1102
+
1103
+ if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
1104
+ if self.model_max_length > LARGE_INTEGER:
1105
+ if verbose:
1106
+ if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
1107
+ logger.warning(
1108
+ "Asking to truncate to max_length but no maximum length is provided and the model has"
1109
+ " no predefined maximum length. Default to no truncation."
1110
+ )
1111
+ self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
1112
+ truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
1113
+ else:
1114
+ max_length = self.model_max_length
1115
+
1116
+ # Test if we have a padding token
1117
+ if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.pad_token is None or self.pad_token_id < 0):
1118
+ raise ValueError(
1119
+ "Asking to pad but the tokenizer does not have a padding token. "
1120
+ "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
1121
+ "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
1122
+ )
1123
+
1124
+ # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
1125
+ if (
1126
+ truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
1127
+ and padding_strategy != PaddingStrategy.DO_NOT_PAD
1128
+ and pad_to_multiple_of is not None
1129
+ and max_length is not None
1130
+ and (max_length % pad_to_multiple_of != 0)
1131
+ ):
1132
+ raise ValueError(
1133
+ "Truncation and padding are both activated but "
1134
+ f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
1135
+ )
1136
+
1137
+ return padding_strategy, truncation_strategy, max_length, kwargs
1138
+
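
A sketch of how the resolution above behaves for common argument combinations (assuming the loaded tokenizer defines a valid pad token):

    strategies = tok._get_padding_truncation_strategies(padding=True)
    # -> (PaddingStrategy.LONGEST, TruncationStrategy.DO_NOT_TRUNCATE, None, {})
    strategies = tok._get_padding_truncation_strategies(
        padding="max_length", truncation=True, max_length=32
    )
    # -> (PaddingStrategy.MAX_LENGTH, TruncationStrategy.LONGEST_FIRST, 32, {})
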
1139
+ def _pad(
1140
+ self,
1141
+ encoded_inputs: dict[str, EncodedInput] | BatchEncoding,
1142
+ max_length: int | None = None,
1143
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
1144
+ pad_to_multiple_of: int | None = None,
1145
+ padding_side: str | None = None,
1146
+ return_attention_mask: bool | None = None,
1147
+ ) -> dict:
1148
+ """
1149
+ Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
1150
+
1151
+ Args:
1152
+ encoded_inputs:
1153
+ Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`).
1154
+ max_length: maximum length of the returned list and optionally padding length (see below).
1155
+ Will truncate by taking into account the special tokens.
1156
+ padding_strategy: PaddingStrategy to use for padding.
1157
+
1158
+ - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
1159
+ - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
1160
+ - PaddingStrategy.DO_NOT_PAD: Do not pad
1161
+ The tokenizer padding sides are defined in `padding_side` argument:
1162
+
1163
+ - 'left': pads on the left of the sequences
1164
+ - 'right': pads on the right of the sequences
1165
+ pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
1166
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
1167
+ `>= 7.5` (Volta).
1168
+ padding_side:
1169
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
1170
+ Default value is picked from the class attribute of the same name.
1171
+ return_attention_mask:
1172
+ (optional) Set to False to avoid returning attention mask (default: set to model specifics)
1173
+ """
1174
+ # Load from model defaults
1175
+ if return_attention_mask is None:
1176
+ return_attention_mask = "attention_mask" in self.model_input_names
1177
+
1178
+ required_input = encoded_inputs[self.model_input_names[0]]
1179
+
1180
+ if padding_strategy == PaddingStrategy.LONGEST:
1181
+ max_length = len(required_input)
1182
+
1183
+ if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
1184
+ max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
1185
+
1186
+ needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
1187
+
1188
+ # Initialize attention mask if not present.
1189
+ if return_attention_mask and "attention_mask" not in encoded_inputs:
1190
+ encoded_inputs["attention_mask"] = [1] * len(required_input)
1191
+
1192
+ if needs_to_be_padded:
1193
+ difference = max_length - len(required_input)
1194
+ padding_side = padding_side if padding_side is not None else self.padding_side
1195
+
1196
+ if padding_side == "right":
1197
+ if return_attention_mask:
1198
+ encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
1199
+ if "special_tokens_mask" in encoded_inputs:
1200
+ encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
1201
+ encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
1202
+ elif padding_side == "left":
1203
+ if return_attention_mask:
1204
+ encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
1205
+ if "special_tokens_mask" in encoded_inputs:
1206
+ encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
1207
+ encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
1208
+ else:
1209
+ raise ValueError(f"Invalid padding strategy:{padding_side}")
1210
+
1211
+ return encoded_inputs
1212
+
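
`_pad` operates on a single, un-batched encoding; a sketch of left padding, assuming `tok.pad_token_id == 0` for the loaded tokenizer:

    padded = tok._pad(
        {"input_ids": [5, 6, 7]},
        max_length=5,
        padding_strategy=PaddingStrategy.MAX_LENGTH,
        padding_side="left",
    )
    # -> {"input_ids": [0, 0, 5, 6, 7], "attention_mask": [0, 0, 1, 1, 1]}
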
1213
+ def pad(
1214
+ self,
1215
+ encoded_inputs: BatchEncoding
1216
+ | list[BatchEncoding]
1217
+ | dict[str, EncodedInput]
1218
+ | dict[str, list[EncodedInput]]
1219
+ | list[dict[str, EncodedInput]],
1220
+ padding: bool | str | PaddingStrategy = True,
1221
+ max_length: int | None = None,
1222
+ pad_to_multiple_of: int | None = None,
1223
+ padding_side: str | None = None,
1224
+ return_attention_mask: bool | None = None,
1225
+ return_tensors: str | TensorType | None = None,
1226
+ verbose: bool = True,
1227
+ ) -> BatchEncoding:
1228
+ """
1229
+ Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
1230
+ in the batch.
1231
+
1232
+ Padding side (left/right) and padding token ids are defined at the tokenizer level (with `self.padding_side`,
1233
+ `self.pad_token_id`).
1234
+ <Tip>
1235
+
1236
+ If the `encoded_inputs` passed are dictionaries of numpy arrays or PyTorch tensors, the
1237
+ result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
1238
+ PyTorch tensors, however, you will lose the specific device of your tensors.
1239
+
1240
+ </Tip>
1241
+
1242
+ Args:
1243
+ encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, list[int]]`, `Dict[str, list[list[int]]` or `List[Dict[str, list[int]]]`):
1244
+ Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, list[int]]`) or a batch of
1245
+ tokenized inputs (list of [`BatchEncoding`], *Dict[str, list[list[int]]]* or *List[Dict[str,
1246
+ list[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
1247
+ collate function.
1248
+
1249
+ Instead of `list[int]` you can have tensors (numpy arrays, PyTorch tensors), see
1250
+ the note above for the return type.
1251
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
1252
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding
1253
+ index) among:
1254
+
1255
+ - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
1256
+ sequence is provided).
1257
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
1258
+ acceptable input length for the model if that argument is not provided.
1259
+ - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
1260
+ lengths).
1261
+ max_length (`int`, *optional*):
1262
+ Maximum length of the returned list and optionally padding length (see above).
1263
+ pad_to_multiple_of (`int`, *optional*):
1264
+ If set will pad the sequence to a multiple of the provided value.
1265
+
1266
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
1267
+ `>= 7.5` (Volta).
1268
+ padding_side (`str`, *optional*):
1269
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
1270
+ Default value is picked from the class attribute of the same name.
1271
+ return_attention_mask (`bool`, *optional*):
1272
+ Whether to return the attention mask. If left to the default, will return the attention mask according
1273
+ to the specific tokenizer's default, defined by the `return_outputs` attribute.
1274
+
1275
+ [What are attention masks?](../glossary#attention-mask)
1276
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
1277
+ If set, will return tensors instead of list of python integers. Acceptable values are:
1278
+
1279
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
1280
+ - `'np'`: Return Numpy `np.ndarray` objects.
1281
+ verbose (`bool`, *optional*, defaults to `True`):
1282
+ Whether or not to print more information and warnings.
1283
+ """
1284
+ # If we have a list of dicts, let's convert it in a dict of lists
1285
+ # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
1286
+ if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
1287
+ # Call .keys() explicitly for compatibility with TensorDict and other Mapping subclasses
1288
+ encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
1289
+
1290
+ # The model's main input name, usually `input_ids`, has been passed for padding
1291
+ if self.model_input_names[0] not in encoded_inputs:
1292
+ raise ValueError(
1293
+ "You should supply an encoding or a list of encodings to this method "
1294
+ f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
1295
+ )
1296
+
1297
+ required_input = encoded_inputs[self.model_input_names[0]]
1298
+
1299
+ if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):
1300
+ if return_attention_mask:
1301
+ encoded_inputs["attention_mask"] = []
1302
+ return encoded_inputs
1303
+
1304
+ # If we have PyTorch/NumPy tensors/arrays as inputs, we cast them as python objects
1305
+ # and rebuild them afterwards if no return_tensors is specified
1306
+ # Note that we lose the specific device the tensor may be on for PyTorch
1307
+
1308
+ first_element = required_input[0]
1309
+ if isinstance(first_element, (list, tuple)):
1310
+ # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
1311
+ for item in required_input:
1312
+ if len(item) != 0:
1313
+ first_element = item[0]
1314
+ break
1315
+ # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
1316
+ if not isinstance(first_element, (int, list, tuple)):
1317
+ if is_torch_tensor(first_element):
1318
+ return_tensors = "pt" if return_tensors is None else return_tensors
1319
+ elif isinstance(first_element, np.ndarray):
1320
+ return_tensors = "np" if return_tensors is None else return_tensors
1321
+ else:
1322
+ raise ValueError(
1323
+ f"type of {first_element} unknown: {type(first_element)}. "
1324
+ "Should be one of a python, numpy, or pytorch object."
1325
+ )
1326
+
1327
+ for key, value in encoded_inputs.items():
1328
+ encoded_inputs[key] = to_py_obj(value)
1329
+
1330
+ # Convert padding_strategy in PaddingStrategy
1331
+ padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
1332
+ padding=padding, max_length=max_length, verbose=verbose
1333
+ )
1334
+
1335
+ required_input = encoded_inputs[self.model_input_names[0]]
1336
+ if required_input and not isinstance(required_input[0], (list, tuple)):
1337
+ encoded_inputs = self._pad(
1338
+ encoded_inputs,
1339
+ max_length=max_length,
1340
+ padding_strategy=padding_strategy,
1341
+ pad_to_multiple_of=pad_to_multiple_of,
1342
+ padding_side=padding_side,
1343
+ return_attention_mask=return_attention_mask,
1344
+ )
1345
+ return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
1346
+
1347
+ batch_size = len(required_input)
1348
+ assert all(len(v) == batch_size for v in encoded_inputs.values()), (
1349
+ "Some items in the output dictionary have a different batch size than others."
1350
+ )
1351
+
1352
+ if padding_strategy == PaddingStrategy.LONGEST:
1353
+ max_length = max(len(inputs) for inputs in required_input)
1354
+ padding_strategy = PaddingStrategy.MAX_LENGTH
1355
+
1356
+ batch_outputs = {}
1357
+ for i in range(batch_size):
1358
+ inputs = {k: v[i] for k, v in encoded_inputs.items()}
1359
+ outputs = self._pad(
1360
+ inputs,
1361
+ max_length=max_length,
1362
+ padding_strategy=padding_strategy,
1363
+ pad_to_multiple_of=pad_to_multiple_of,
1364
+ padding_side=padding_side,
1365
+ return_attention_mask=return_attention_mask,
1366
+ )
1367
+
1368
+ for key, value in outputs.items():
1369
+ if key not in batch_outputs:
1370
+ batch_outputs[key] = []
1371
+ batch_outputs[key].append(value)
1372
+
1373
+ return BatchEncoding(batch_outputs, tensor_type=return_tensors)
1374
+
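The collate comment at the top of `pad` is the key use case for this method. A minimal sketch of `pad` as a `DataLoader` collate function, assuming an illustrative tokenizer checkpoint and toy features (neither is part of this diff):

```python
# Minimal sketch: `tokenizer.pad` as a DataLoader collate_fn, as the comment
# in `pad` above suggests. Checkpoint name and features are illustrative.
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

# A list of dicts: exactly the shape `pad` converts to a dict of lists.
features = [
    {"input_ids": tokenizer("short example").input_ids},
    {"input_ids": tokenizer("a noticeably longer example sentence").input_ids},
]

def collate_fn(batch):
    # pad_to_multiple_of=8 targets the Tensor Core-friendly shapes the docstring mentions.
    return tokenizer.pad(batch, padding="longest", pad_to_multiple_of=8, return_tensors="pt")

loader = DataLoader(features, batch_size=2, collate_fn=collate_fn)
batch = next(iter(loader))
print(batch["input_ids"].shape, batch["attention_mask"].shape)
```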
+    def truncate_sequences(
        self,
        ids: list[int],
        pair_ids: None = None,
@@ -1009,36 +1407,47 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
            `Tuple[list[int], None, list[int]]`: The truncated `ids` and the list of
            overflowing tokens. `None` is returned to match the Transformers signature.
        """
-
+        if kwargs:
+            raise ValueError(
+                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.truncate_sequences`."
+            )
        if pair_ids:
            raise ValueError("`pair_ids` is not supported by `MistralCommonBackend.truncate_sequences`.")

+        if num_tokens_to_remove <= 0:
+            return (ids, None, [])
+
        if not isinstance(truncation_strategy, TruncationStrategy):
            truncation_strategy = TruncationStrategy(truncation_strategy)

-        if truncation_strategy in [
-            TruncationStrategy.ONLY_FIRST,
-            TruncationStrategy.ONLY_SECOND,
-        ]:
-            raise ValueError(f"{truncation_strategy=} is not supported by `MistralCommonBackend`.")
-
-        if num_tokens_to_remove <= 0:
-            return ids, None, []
+        if truncation_strategy in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND]:
+            raise ValueError(
+                f"Only {TruncationStrategy.LONGEST_FIRST} and {TruncationStrategy.DO_NOT_TRUNCATE} are supported."
+            )

        overflowing_tokens = []
-
        if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
-            window_len = min(len(ids), stride + num_tokens_to_remove)
-            if self.truncation_side == "left":
-                overflowing_tokens = ids[:window_len]
-                ids = ids[num_tokens_to_remove:]
+            if len(ids) > num_tokens_to_remove:
+                window_len = min(len(ids), stride + num_tokens_to_remove)
+                if self.truncation_side == "left":
+                    overflowing_tokens = ids[:window_len]
+                    ids = ids[num_tokens_to_remove:]
+                elif self.truncation_side == "right":
+                    overflowing_tokens = ids[-window_len:]
+                    ids = ids[:-num_tokens_to_remove]
+                else:
+                    raise ValueError(f"Invalid truncation side: {self.truncation_side}. Use 'left' or 'right'.")
+
            else:
-                overflowing_tokens = ids[-window_len:]
-                ids = ids[:-num_tokens_to_remove]
+                error_msg = (
+                    f"We need to remove {num_tokens_to_remove} tokens to truncate the input, "
+                    f"but the first sequence has a length of only {len(ids)}."
+                )
+                logger.error(error_msg)

-        return ids, None, overflowing_tokens
+        return (ids, None, overflowing_tokens)

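A small sketch of the truncation contract implemented above; the export location, checkpoint path, and token IDs are illustrative assumptions:

```python
# Hedged sketch of `truncate_sequences`. With the default truncation_side="right",
# stride=1 and num_tokens_to_remove=2: window_len = min(6, 1 + 2) = 3.
from transformers import MistralCommonBackend  # assumed export location

tok = MistralCommonBackend.from_pretrained("path/to/mistral/tokenizer")  # placeholder path
ids = [101, 7592, 2088, 2003, 2307, 102]  # made-up token IDs

new_ids, _, overflow = tok.truncate_sequences(
    ids,
    num_tokens_to_remove=2,
    truncation_strategy="longest_first",
    stride=1,
)
# new_ids  == [101, 7592, 2088, 2003]  # ids[:-2]
# overflow == [2003, 2307, 102]        # ids[-3:], overlapping new_ids by one stride token
```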
-    def apply_chat_template(  # type: ignore[override]
+    def apply_chat_template(
        self,
        conversation: list[dict[str, str]] | list[list[dict[str, str]]],
        tools: list[dict | Callable] | None = None,
@@ -1066,8 +1475,8 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
                [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
                for more information.
            add_generation_prompt (`bool`, *optional*):
-                This argument is a no-op for `MistralCommonBackend`. However, it cannot be used at the same time as `continue_final_message` to keep the API consistent.
-                If any conversation ends with an assistant message, it will raise an error. In such cases, use `continue_final_message` instead.
+                This argument is a no-op for `MistralCommonBackend`. However, to keep the API consistent, it cannot be used at the same time as `continue_final_message`,
+                and if any conversation ends with an assistant message, it will raise an error. In such cases, use `continue_final_message` instead.
            continue_final_message (`bool`, *optional*):
                If this is set, the chat will be formatted so that the final
                message in the chat is open-ended, without any EOS tokens. The model will continue this message
@@ -1102,7 +1511,8 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
                Will raise an error if used.

        Returns:
-            `Union[str, list[int], list[str], list[list[int]], BatchEncoding]`: The tokenized chat so far, including control tokens. This output is ready to pass to the model, either directly or via methods like `generate()`.
+            `Union[str, list[int], list[str], list[list[int]], BatchEncoding]`: A list of token IDs representing the tokenized chat so far, including control
+            tokens. This output is ready to pass to the model, either directly or via methods like `generate()`.
        """
        if kwargs:
            raise ValueError(
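Between these hunks, a hedged sketch of the documented return contract; the export location and checkpoint path are placeholder assumptions:

```python
# Sketch of `apply_chat_template` as documented above.
from transformers import MistralCommonBackend  # assumed export location

tok = MistralCommonBackend.from_pretrained("path/to/mistral/tokenizer")  # placeholder path
conversation = [{"role": "user", "content": "Write a haiku about diffs."}]
token_ids = tok.apply_chat_template(conversation, add_generation_prompt=True)

# Per the Returns section: ready to feed to the model, e.g. model.generate(...)
print(token_ids[:8])
```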
@@ -1249,83 +1659,6 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
        )
        return outputs

-    def build_inputs_with_special_tokens(self, token_ids_0: list[int], token_ids_1: None = None) -> list[int]:
-        """
-        Build model inputs from a sequence by adding special tokens.
-
-        This method dynamically builds inputs based on the tokenizer's `mode`:
-        - `"test"`: [BOS] seq0
-        - `"finetuning"`: [BOS] seq0 [EOS]
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`None`, *optional*): None, kept to match Transformers' signature.
-
-        Returns:
-            `list[int]`: List of input IDs with the appropriate special tokens.
-        """
-        if token_ids_1 is not None:
-            raise ValueError(
-                "`MistralCommonBackend` does not implement `token_ids_1 != None` for `build_inputs_with_special_tokens`."
-            )
-
-        if self.mode == ValidationMode.test:
-            # [BOS] seq0
-            return [self.bos_token_id] + token_ids_0
-
-        else:
-            # [BOS] seq0 [EOS]
-            return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
-
-    def create_token_type_ids_from_sequences(self, token_ids_0: list[int], token_ids_1: None = None) -> list[int]:
-        """
-        Create a mask of zeros from the token IDs with special tokens added.
-
-        Kept to match Transformers' implementation.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`None`, *optional*): None, kept to match Transformers' signature.
-
-        Returns:
-            `list[int]`: Token type IDs according to the configured pattern.
-        """
-        if token_ids_1 is not None:
-            raise ValueError(
-                "`MistralCommonBackend` does not implement `token_ids_1 != None` for `create_token_type_ids_from_sequences`."
-            )
-
-        sequence = self.build_inputs_with_special_tokens(token_ids_0)
-
-        return [0] * len(sequence)
-
-    def num_special_tokens_to_add(self, pair: Literal[False] = False) -> int:
-        """
-        Returns the number of added tokens when encoding a sequence with special tokens.
-
-        <Tip>
-
-        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
-        this inside your training loop.
-
-        </Tip>
-
-        Args:
-            pair (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
-
-        Returns:
-            `int`: Number of special tokens added to sequences.
-        """
-        if pair:
-            raise ValueError(
-                "`MistralCommonBackend` does not implement `pair = True` for `num_special_tokens_to_add`."
-            )
-
-        return len(self.build_inputs_with_special_tokens([], None))
-

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def __call__(
        self,
@@ -1346,8 +1679,6 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
        return_special_tokens_mask: bool = False,
        return_length: bool = False,
        verbose: bool = True,
-        return_offsets_mapping: Literal[False] = False,
-        split_special_tokens: Literal[False] = False,
        **kwargs,
    ) -> BatchEncoding:
        """
@@ -1365,16 +1696,6 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
            text_pair_target (`None`, *optional*):
                Not supported by `MistralCommonBackend`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
        """
-        if return_offsets_mapping or split_special_tokens:
-            raise ValueError(
-                "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
-            )
-
-        if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND, "only_first", "only_second"]:
-            raise ValueError(
-                "Truncation strategies `only_first` and `only_second` are not supported by `MistralCommonBackend`."
-            )
-
        if kwargs:
            raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.__call__`.")

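A sketch of the call contract these checks guard; the single-versus-batch dispatch appears in the next hunk, and the export location and path below are placeholders:

```python
# Hedged sketch of `__call__`: a `str` is routed to `_encode_plus`, a list of
# strings to `_batch_encode_plus` (see the dispatch in the following hunk).
from transformers import MistralCommonBackend  # assumed export location

tok = MistralCommonBackend.from_pretrained("path/to/mistral/tokenizer")  # placeholder path

single = tok("Hello world")                                                # one example
batch = tok(["Hello world", "Bonjour"], padding=True, return_tensors="pt")

print(single["input_ids"])       # list[int]
print(batch["input_ids"].shape)  # (2, padded_len) tensor
```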
@@ -1383,31 +1704,84 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
            "`text_pair`, `text_target` and `text_pair_target` are not supported by `MistralCommonBackend`."
        )

-        return super().__call__(
-            text=text,
-            text_pair=text_pair,
-            text_target=text_target,
-            add_special_tokens=add_special_tokens,
+        def _is_valid_text_input(t):
+            if isinstance(t, str):
+                # Strings are fine
+                return True
+            elif isinstance(t, (list, tuple)):
+                # Lists are fine as long as they are...
+                if len(t) == 0:
+                    # ... empty
+                    return True
+                elif isinstance(t[0], (str, int)):
+                    # ... a list of strings or ints
+                    return True
+                elif isinstance(t[0], (list, tuple)):
+                    # ... a list containing an empty list, a list of strings, or a list of ints
+                    return len(t[0]) == 0 or isinstance(t[0][0], (str, int))
+                else:
+                    return False
+            else:
+                return False
+
+        if not _is_valid_text_input(text):
+            raise ValueError(
+                "text input must be of type `str` (single example), `list[str]` (batch or single encoded example) "
+                "or `list[list[int]]` (batch of encoded examples)."
+            )
+
+        is_batched = isinstance(text, (list, tuple)) and isinstance(text[0], (str, list, tuple))
+
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
-            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_length=return_length,
            verbose=verbose,
+            **kwargs,
        )

+        if is_batched:
+            return self._batch_encode_plus(
+                batch_text=text,
+                add_special_tokens=add_special_tokens,
+                padding_strategy=padding_strategy,
+                truncation_strategy=truncation_strategy,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_length=return_length,
+                verbose=verbose,
+            )
+        else:
+            return self._encode_plus(
+                text=text,
+                add_special_tokens=add_special_tokens,
+                padding_strategy=padding_strategy,
+                truncation_strategy=truncation_strategy,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_length=return_length,
+                verbose=verbose,
+            )
+

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str | os.PathLike,
        *init_inputs,
-        mode: str | ValidationMode = ValidationMode.test,
+        mode: Union[str, ValidationMode] = ValidationMode.test,
        cache_dir: str | os.PathLike | None = None,
        force_download: bool = False,
        local_files_only: bool = False,
@@ -1434,9 +1808,9 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
                `./my_model_directory/`.
            mode (`Union[str, ValidationMode]`, *optional*, defaults to `ValidationMode.test`):
                Validation mode for the `MistralTokenizer` tokenizer. Possible values are:
-                - `"finetuning"` or `ValidationMode.finetuning`: The fine-tuning mode.
+                - `"finetuning"` or `ValidationMode.finetuning`: The finetuning mode.
                - `"test"` or `ValidationMode.test`: The test mode.
                It changes how the tokenizer validates the input and prepares the request to the model.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if the
                standard cache should not be used.
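A short sketch of the `mode` switch described above; the `ValidationMode` import path is assumed from `mistral-common`, and the checkpoint path is a placeholder:

```python
# Sketch: the two accepted validation modes, passed as string or enum.
from mistral_common.protocol.instruct.validator import ValidationMode  # assumed import path

from transformers import MistralCommonBackend  # assumed export location

tok_eval = MistralCommonBackend.from_pretrained("path/to/mistral/tokenizer", mode="test")
tok_train = MistralCommonBackend.from_pretrained(
    "path/to/mistral/tokenizer", mode=ValidationMode.finetuning
)
```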
@@ -1463,11 +1837,11 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
                Default value is picked from the class attribute of the same name.
            truncation_side (`str`, *optional*, defaults to `"right"`):
                The side on which the model should have truncation applied. Should be 'right' or 'left'.
            model_input_names (`List[str]`, *optional*):
                The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
                `"attention_mask"`). Default value is picked from the class attribute of the same name.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not the model should clean up the spaces that were added when splitting the input text during the
                tokenization process.
            kwargs (additional keyword arguments, *optional*):
                Not supported by `MistralCommonBackend.from_pretrained`.
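The attributes documented above can be overridden at load time; a hedged sketch with a placeholder path:

```python
# Sketch: overriding the documented load-time attributes.
from transformers import MistralCommonBackend  # assumed export location

tok = MistralCommonBackend.from_pretrained(
    "path/to/mistral/tokenizer",                        # placeholder path
    padding_side="left",                                # e.g. for decoder-only generation
    truncation_side="left",
    model_input_names=["input_ids", "attention_mask"],
    clean_up_tokenization_spaces=False,
)
```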
@@ -1477,13 +1851,10 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
            raise ValueError("`init_inputs` are not supported by `MistralCommonBackend.from_pretrained`.")

        # Handle kwargs and AutoTokenizer/AutoProcessor case
-        valid_kwargs = _VALID_INIT_KWARGS.union(
-            {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "subfolder"}
-        )
-        if kwargs and not set(kwargs.keys()).issubset(valid_kwargs):
-            raise ValueError(
-                f"Some kwargs in {list(kwargs.keys())} are not supported by `MistralCommonBackend.from_pretrained`."
-            )
+        if kwargs and not set(kwargs.keys()).issubset(
+            {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "_from_auto"}
+        ):
+            raise ValueError(f"Some kwargs in {kwargs} are not supported by `MistralCommonBackend.from_pretrained`.")

        mode = cls._get_validation_mode(mode)

@@ -1497,8 +1868,35 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
                local_files_only=local_files_only,
            )
        else:
-            candidate_files = os.listdir(pretrained_model_name_or_path)
-            tokenizer_path = os.path.join(pretrained_model_name_or_path, get_one_valid_tokenizer_file(candidate_files))
+            valid_tokenizer_files = []
+            tokenizer_file: str
+
+            instruct_versions = list(TokenizerVersion.__members__)
+            mm_versions = list(MultiModalVersion.__members__) + [""]  # allow no mm version
+            sentencepiece_suffixes = [f".model.{v}{m}" for v in instruct_versions for m in mm_versions] + [".model"]
+
+            for path in os.listdir(pretrained_model_name_or_path):
+                pathlib_repo_file = Path(path)
+                file_name = pathlib_repo_file.name
+                suffix = "".join(pathlib_repo_file.suffixes)
+                if file_name == "tekken.json" or suffix in sentencepiece_suffixes:
+                    valid_tokenizer_files.append(file_name)
+
+            if len(valid_tokenizer_files) == 0:
+                raise ValueError(f"No tokenizer file found in directory: {pretrained_model_name_or_path}")
+            # If there are multiple tokenizer files, use tekken.json if it exists, otherwise the lexicographically latest versioned one.
+            if len(valid_tokenizer_files) > 1:
+                if "tekken.json" in valid_tokenizer_files:
+                    tokenizer_file = "tekken.json"
+                else:
+                    tokenizer_file = max(valid_tokenizer_files)
+                logger.warning(
+                    f"Multiple tokenizer files found in directory: {pretrained_model_name_or_path}. Using {tokenizer_file}."
+                )
+            else:
+                tokenizer_file = valid_tokenizer_files[0]
+
+            tokenizer_path = os.path.join(pretrained_model_name_or_path, tokenizer_file)

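The selection rule in this hunk reduces to a one-liner; an illustrative sketch with hypothetical file names:

```python
# Illustrative sketch of the rule above: "tekken.json" wins outright,
# otherwise max() picks the lexicographically greatest versioned file.
candidates = ["tokenizer.model.v3", "tokenizer.model.v7m1"]  # hypothetical names
chosen = "tekken.json" if "tekken.json" in candidates else max(candidates)
print(chosen)  # -> "tokenizer.model.v7m1"
```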
        return cls(
            tokenizer_path=tokenizer_path,
@@ -1510,7 +1908,7 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        )

-    def save_pretrained(  # type: ignore[override]
+    def save_pretrained(
        self,
        save_directory: str | os.PathLike | Path,
        push_to_hub: bool = False,
@@ -1572,7 +1970,7 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
        return (str(save_directory / self._tokenizer_path.name),)

    @staticmethod
-    def _get_validation_mode(mode: str | ValidationMode) -> ValidationMode:
+    def _get_validation_mode(mode: Union[str, ValidationMode]) -> ValidationMode:
        """Get the validation mode from a string or a ValidationMode."""
        _invalid_mode_msg = (
            f"Invalid `mistral-common` tokenizer mode: {mode}. Possible values are 'finetuning' or 'test'."
        )
@@ -1588,66 +1986,3 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
        if mode not in [ValidationMode.finetuning, ValidationMode.test]:
            raise ValueError(_invalid_mode_msg)
        return mode
-
-    def add_special_tokens(
-        self,
-        special_tokens_dict: dict[str, str | AddedToken | Sequence[str | AddedToken]],
-        replace_extra_special_tokens: bool = True,
-    ):
-        r"""`MistralCommonBackend` does not implement `add_special_tokens` by design.
-
-        If you would like this behaviour to be implemented, please open an issue in the `Transformers` or `mistral-common` repositories to request it.
-        """
-
-        raise NotImplementedError("`MistralCommonBackend` does not implement `add_special_tokens`.")
-
-    def add_tokens(  # type: ignore[override]
-        self,
-        special_tokens_dict: dict[str, str | AddedToken | Sequence[str | AddedToken]],
-        replace_extra_special_tokens: bool = True,
-    ):
-        """
-        `MistralCommonBackend` does not implement `add_tokens` by design.
-
-        If you would like this behaviour to be implemented, please open an issue in the `Transformers` or `mistral-common` repositories to request it.
-        """
-
-        raise NotImplementedError("`MistralCommonBackend` does not implement `add_tokens`.")
-
-    def convert_added_tokens(cls, obj: AddedToken | Any, save: bool = False, add_type_field: bool = True):  # type: ignore[override]
-        """
-        `MistralCommonBackend` does not implement `convert_added_tokens` by design.
-
-        If you would like this behaviour to be implemented, please open an issue in the `Transformers` or `mistral-common` repositories to request it.
-        """
-
-        raise NotImplementedError("`MistralCommonBackend` does not implement `convert_added_tokens`.")
-
-    def get_chat_template(self, chat_template: str | None = None, tools: list[dict] | None = None) -> str:
-        """`MistralCommonBackend` does not implement `get_chat_template` by design, as `mistral-common` does not use chat templates."""
-
-        raise NotImplementedError("`MistralCommonBackend` does not implement `get_chat_template`.")
-
-    def save_chat_templates(
-        self,
-        save_directory: str | os.PathLike,
-        tokenizer_config: dict,
-        filename_prefix: str | None,
-        save_jinja_files: bool,
-    ):
-        """`MistralCommonBackend` does not implement `save_chat_templates` by design, as `mistral-common` does not use chat templates."""
-
-        raise NotImplementedError("`MistralCommonBackend` does not implement `save_chat_templates`.")
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str, ...]:
-        """
-        `MistralCommonBackend` does not implement `save_vocabulary` by design.
-
-        This is because `mistral-common` is configured by a single tokenizer file. If you'd like to save the vocabulary, please consider using the `save_pretrained` method instead.
-        """
-
-        raise NotImplementedError("`MistralCommonBackend` does not implement `save_vocabulary`.")
-
-
-# Backward compatibility alias for codebases still importing the legacy name.
-MistralCommonTokenizer = MistralCommonBackend
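For reference, the alias removed here is what kept legacy imports working on the 5.0.0 side of this diff; a sketch of the pattern it supported (top-level export assumed):

```python
# Under 5.0.0 both names resolved to the same class; 5.0.0rc0 predates the alias.
from transformers import MistralCommonBackend, MistralCommonTokenizer  # export location assumed

assert MistralCommonTokenizer is MistralCommonBackend
```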