transformers-5.0.0rc2-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (1594)
  1. transformers/__init__.py +11 -37
  2. transformers/activations.py +2 -2
  3. transformers/audio_utils.py +32 -32
  4. transformers/backbone_utils.py +326 -0
  5. transformers/cache_utils.py +26 -126
  6. transformers/cli/chat.py +3 -3
  7. transformers/cli/serve.py +13 -10
  8. transformers/cli/transformers.py +2 -1
  9. transformers/configuration_utils.py +22 -92
  10. transformers/conversion_mapping.py +150 -26
  11. transformers/convert_slow_tokenizer.py +9 -12
  12. transformers/core_model_loading.py +217 -129
  13. transformers/data/processors/glue.py +0 -1
  14. transformers/data/processors/utils.py +0 -1
  15. transformers/data/processors/xnli.py +0 -1
  16. transformers/dependency_versions_check.py +0 -1
  17. transformers/dependency_versions_table.py +10 -11
  18. transformers/distributed/configuration_utils.py +1 -2
  19. transformers/dynamic_module_utils.py +23 -23
  20. transformers/feature_extraction_sequence_utils.py +19 -23
  21. transformers/feature_extraction_utils.py +14 -14
  22. transformers/file_utils.py +0 -2
  23. transformers/generation/candidate_generator.py +2 -4
  24. transformers/generation/configuration_utils.py +54 -39
  25. transformers/generation/continuous_batching/__init__.py +0 -1
  26. transformers/generation/continuous_batching/cache.py +74 -44
  27. transformers/generation/continuous_batching/cache_manager.py +28 -28
  28. transformers/generation/continuous_batching/continuous_api.py +133 -414
  29. transformers/generation/continuous_batching/input_ouputs.py +464 -0
  30. transformers/generation/continuous_batching/requests.py +77 -19
  31. transformers/generation/continuous_batching/scheduler.py +154 -104
  32. transformers/generation/logits_process.py +10 -133
  33. transformers/generation/stopping_criteria.py +1 -2
  34. transformers/generation/streamers.py +0 -1
  35. transformers/generation/utils.py +91 -121
  36. transformers/generation/watermarking.py +2 -3
  37. transformers/hf_argparser.py +9 -13
  38. transformers/hyperparameter_search.py +1 -2
  39. transformers/image_processing_base.py +9 -9
  40. transformers/image_processing_utils.py +11 -15
  41. transformers/image_processing_utils_fast.py +70 -71
  42. transformers/image_transforms.py +73 -42
  43. transformers/image_utils.py +30 -37
  44. transformers/initialization.py +57 -0
  45. transformers/integrations/__init__.py +10 -24
  46. transformers/integrations/accelerate.py +47 -11
  47. transformers/integrations/awq.py +1 -3
  48. transformers/integrations/deepspeed.py +146 -4
  49. transformers/integrations/eetq.py +0 -1
  50. transformers/integrations/executorch.py +2 -6
  51. transformers/integrations/fbgemm_fp8.py +1 -2
  52. transformers/integrations/finegrained_fp8.py +149 -13
  53. transformers/integrations/flash_attention.py +3 -8
  54. transformers/integrations/flex_attention.py +1 -1
  55. transformers/integrations/fp_quant.py +4 -6
  56. transformers/integrations/ggml.py +0 -1
  57. transformers/integrations/hub_kernels.py +18 -7
  58. transformers/integrations/integration_utils.py +2 -3
  59. transformers/integrations/moe.py +226 -106
  60. transformers/integrations/mxfp4.py +52 -40
  61. transformers/integrations/peft.py +488 -176
  62. transformers/integrations/quark.py +2 -4
  63. transformers/integrations/tensor_parallel.py +641 -581
  64. transformers/integrations/torchao.py +4 -6
  65. transformers/loss/loss_lw_detr.py +356 -0
  66. transformers/loss/loss_utils.py +2 -0
  67. transformers/masking_utils.py +199 -59
  68. transformers/model_debugging_utils.py +4 -5
  69. transformers/modelcard.py +14 -192
  70. transformers/modeling_attn_mask_utils.py +19 -19
  71. transformers/modeling_flash_attention_utils.py +28 -29
  72. transformers/modeling_gguf_pytorch_utils.py +5 -5
  73. transformers/modeling_layers.py +21 -22
  74. transformers/modeling_outputs.py +242 -253
  75. transformers/modeling_rope_utils.py +32 -32
  76. transformers/modeling_utils.py +416 -438
  77. transformers/models/__init__.py +10 -0
  78. transformers/models/afmoe/configuration_afmoe.py +40 -33
  79. transformers/models/afmoe/modeling_afmoe.py +38 -41
  80. transformers/models/afmoe/modular_afmoe.py +23 -25
  81. transformers/models/aimv2/configuration_aimv2.py +2 -10
  82. transformers/models/aimv2/modeling_aimv2.py +46 -45
  83. transformers/models/aimv2/modular_aimv2.py +13 -19
  84. transformers/models/albert/configuration_albert.py +8 -2
  85. transformers/models/albert/modeling_albert.py +70 -72
  86. transformers/models/albert/tokenization_albert.py +1 -4
  87. transformers/models/align/configuration_align.py +8 -6
  88. transformers/models/align/modeling_align.py +83 -86
  89. transformers/models/align/processing_align.py +2 -30
  90. transformers/models/altclip/configuration_altclip.py +4 -7
  91. transformers/models/altclip/modeling_altclip.py +106 -103
  92. transformers/models/altclip/processing_altclip.py +2 -15
  93. transformers/models/apertus/__init__.py +0 -1
  94. transformers/models/apertus/configuration_apertus.py +23 -28
  95. transformers/models/apertus/modeling_apertus.py +35 -38
  96. transformers/models/apertus/modular_apertus.py +36 -40
  97. transformers/models/arcee/configuration_arcee.py +25 -30
  98. transformers/models/arcee/modeling_arcee.py +35 -38
  99. transformers/models/arcee/modular_arcee.py +20 -23
  100. transformers/models/aria/configuration_aria.py +31 -44
  101. transformers/models/aria/image_processing_aria.py +25 -27
  102. transformers/models/aria/modeling_aria.py +102 -102
  103. transformers/models/aria/modular_aria.py +111 -124
  104. transformers/models/aria/processing_aria.py +28 -35
  105. transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +0 -1
  106. transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +3 -6
  107. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +9 -11
  108. transformers/models/audioflamingo3/__init__.py +0 -1
  109. transformers/models/audioflamingo3/configuration_audioflamingo3.py +0 -1
  110. transformers/models/audioflamingo3/modeling_audioflamingo3.py +60 -52
  111. transformers/models/audioflamingo3/modular_audioflamingo3.py +52 -43
  112. transformers/models/audioflamingo3/processing_audioflamingo3.py +6 -8
  113. transformers/models/auto/auto_factory.py +12 -11
  114. transformers/models/auto/configuration_auto.py +48 -5
  115. transformers/models/auto/feature_extraction_auto.py +5 -7
  116. transformers/models/auto/image_processing_auto.py +30 -39
  117. transformers/models/auto/modeling_auto.py +33 -199
  118. transformers/models/auto/processing_auto.py +11 -19
  119. transformers/models/auto/tokenization_auto.py +38 -37
  120. transformers/models/auto/video_processing_auto.py +7 -8
  121. transformers/models/autoformer/configuration_autoformer.py +4 -7
  122. transformers/models/autoformer/modeling_autoformer.py +100 -101
  123. transformers/models/aya_vision/configuration_aya_vision.py +4 -1
  124. transformers/models/aya_vision/modeling_aya_vision.py +64 -99
  125. transformers/models/aya_vision/modular_aya_vision.py +46 -74
  126. transformers/models/aya_vision/processing_aya_vision.py +25 -53
  127. transformers/models/bamba/configuration_bamba.py +46 -39
  128. transformers/models/bamba/modeling_bamba.py +83 -119
  129. transformers/models/bamba/modular_bamba.py +70 -109
  130. transformers/models/bark/configuration_bark.py +6 -8
  131. transformers/models/bark/generation_configuration_bark.py +3 -5
  132. transformers/models/bark/modeling_bark.py +64 -65
  133. transformers/models/bark/processing_bark.py +19 -41
  134. transformers/models/bart/configuration_bart.py +9 -5
  135. transformers/models/bart/modeling_bart.py +124 -129
  136. transformers/models/barthez/tokenization_barthez.py +1 -4
  137. transformers/models/bartpho/tokenization_bartpho.py +6 -7
  138. transformers/models/beit/configuration_beit.py +2 -15
  139. transformers/models/beit/image_processing_beit.py +53 -56
  140. transformers/models/beit/image_processing_beit_fast.py +11 -12
  141. transformers/models/beit/modeling_beit.py +65 -62
  142. transformers/models/bert/configuration_bert.py +12 -2
  143. transformers/models/bert/modeling_bert.py +117 -152
  144. transformers/models/bert/tokenization_bert.py +2 -4
  145. transformers/models/bert/tokenization_bert_legacy.py +3 -5
  146. transformers/models/bert_generation/configuration_bert_generation.py +17 -2
  147. transformers/models/bert_generation/modeling_bert_generation.py +53 -55
  148. transformers/models/bert_generation/tokenization_bert_generation.py +2 -3
  149. transformers/models/bert_japanese/tokenization_bert_japanese.py +5 -6
  150. transformers/models/bertweet/tokenization_bertweet.py +1 -3
  151. transformers/models/big_bird/configuration_big_bird.py +12 -9
  152. transformers/models/big_bird/modeling_big_bird.py +107 -124
  153. transformers/models/big_bird/tokenization_big_bird.py +1 -4
  154. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -9
  155. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +118 -118
  156. transformers/models/biogpt/configuration_biogpt.py +8 -2
  157. transformers/models/biogpt/modeling_biogpt.py +73 -79
  158. transformers/models/biogpt/modular_biogpt.py +60 -66
  159. transformers/models/biogpt/tokenization_biogpt.py +3 -5
  160. transformers/models/bit/configuration_bit.py +2 -5
  161. transformers/models/bit/image_processing_bit.py +21 -24
  162. transformers/models/bit/image_processing_bit_fast.py +0 -1
  163. transformers/models/bit/modeling_bit.py +15 -16
  164. transformers/models/bitnet/configuration_bitnet.py +23 -28
  165. transformers/models/bitnet/modeling_bitnet.py +34 -38
  166. transformers/models/bitnet/modular_bitnet.py +7 -10
  167. transformers/models/blenderbot/configuration_blenderbot.py +8 -5
  168. transformers/models/blenderbot/modeling_blenderbot.py +68 -99
  169. transformers/models/blenderbot/tokenization_blenderbot.py +0 -1
  170. transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -5
  171. transformers/models/blenderbot_small/modeling_blenderbot_small.py +70 -72
  172. transformers/models/blenderbot_small/tokenization_blenderbot_small.py +1 -3
  173. transformers/models/blip/configuration_blip.py +9 -10
  174. transformers/models/blip/image_processing_blip.py +17 -20
  175. transformers/models/blip/image_processing_blip_fast.py +0 -1
  176. transformers/models/blip/modeling_blip.py +115 -108
  177. transformers/models/blip/modeling_blip_text.py +63 -65
  178. transformers/models/blip/processing_blip.py +5 -36
  179. transformers/models/blip_2/configuration_blip_2.py +2 -2
  180. transformers/models/blip_2/modeling_blip_2.py +145 -121
  181. transformers/models/blip_2/processing_blip_2.py +8 -38
  182. transformers/models/bloom/configuration_bloom.py +5 -2
  183. transformers/models/bloom/modeling_bloom.py +60 -60
  184. transformers/models/blt/configuration_blt.py +94 -86
  185. transformers/models/blt/modeling_blt.py +93 -90
  186. transformers/models/blt/modular_blt.py +127 -69
  187. transformers/models/bridgetower/configuration_bridgetower.py +7 -2
  188. transformers/models/bridgetower/image_processing_bridgetower.py +34 -35
  189. transformers/models/bridgetower/image_processing_bridgetower_fast.py +13 -14
  190. transformers/models/bridgetower/modeling_bridgetower.py +136 -124
  191. transformers/models/bridgetower/processing_bridgetower.py +2 -16
  192. transformers/models/bros/configuration_bros.py +24 -18
  193. transformers/models/bros/modeling_bros.py +78 -80
  194. transformers/models/bros/processing_bros.py +2 -12
  195. transformers/models/byt5/tokenization_byt5.py +4 -6
  196. transformers/models/camembert/configuration_camembert.py +8 -2
  197. transformers/models/camembert/modeling_camembert.py +97 -99
  198. transformers/models/camembert/modular_camembert.py +51 -54
  199. transformers/models/camembert/tokenization_camembert.py +1 -4
  200. transformers/models/canine/configuration_canine.py +4 -2
  201. transformers/models/canine/modeling_canine.py +73 -75
  202. transformers/models/canine/tokenization_canine.py +0 -1
  203. transformers/models/chameleon/configuration_chameleon.py +29 -34
  204. transformers/models/chameleon/image_processing_chameleon.py +21 -24
  205. transformers/models/chameleon/image_processing_chameleon_fast.py +5 -6
  206. transformers/models/chameleon/modeling_chameleon.py +135 -92
  207. transformers/models/chameleon/processing_chameleon.py +16 -41
  208. transformers/models/chinese_clip/configuration_chinese_clip.py +10 -8
  209. transformers/models/chinese_clip/image_processing_chinese_clip.py +21 -24
  210. transformers/models/chinese_clip/image_processing_chinese_clip_fast.py +0 -1
  211. transformers/models/chinese_clip/modeling_chinese_clip.py +93 -95
  212. transformers/models/chinese_clip/processing_chinese_clip.py +2 -15
  213. transformers/models/clap/configuration_clap.py +4 -9
  214. transformers/models/clap/feature_extraction_clap.py +9 -10
  215. transformers/models/clap/modeling_clap.py +109 -111
  216. transformers/models/clap/processing_clap.py +2 -15
  217. transformers/models/clip/configuration_clip.py +4 -2
  218. transformers/models/clip/image_processing_clip.py +21 -24
  219. transformers/models/clip/image_processing_clip_fast.py +9 -1
  220. transformers/models/clip/modeling_clip.py +70 -68
  221. transformers/models/clip/processing_clip.py +2 -14
  222. transformers/models/clip/tokenization_clip.py +2 -5
  223. transformers/models/clipseg/configuration_clipseg.py +4 -2
  224. transformers/models/clipseg/modeling_clipseg.py +113 -112
  225. transformers/models/clipseg/processing_clipseg.py +19 -42
  226. transformers/models/clvp/configuration_clvp.py +15 -5
  227. transformers/models/clvp/feature_extraction_clvp.py +7 -10
  228. transformers/models/clvp/modeling_clvp.py +138 -145
  229. transformers/models/clvp/number_normalizer.py +1 -2
  230. transformers/models/clvp/processing_clvp.py +3 -20
  231. transformers/models/clvp/tokenization_clvp.py +0 -1
  232. transformers/models/code_llama/tokenization_code_llama.py +3 -6
  233. transformers/models/codegen/configuration_codegen.py +4 -4
  234. transformers/models/codegen/modeling_codegen.py +50 -49
  235. transformers/models/codegen/tokenization_codegen.py +5 -6
  236. transformers/models/cohere/configuration_cohere.py +25 -30
  237. transformers/models/cohere/modeling_cohere.py +39 -42
  238. transformers/models/cohere/modular_cohere.py +27 -31
  239. transformers/models/cohere/tokenization_cohere.py +5 -6
  240. transformers/models/cohere2/configuration_cohere2.py +27 -32
  241. transformers/models/cohere2/modeling_cohere2.py +38 -41
  242. transformers/models/cohere2/modular_cohere2.py +48 -52
  243. transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
  244. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +9 -10
  245. transformers/models/cohere2_vision/modeling_cohere2_vision.py +52 -55
  246. transformers/models/cohere2_vision/modular_cohere2_vision.py +41 -43
  247. transformers/models/cohere2_vision/processing_cohere2_vision.py +6 -36
  248. transformers/models/colpali/configuration_colpali.py +0 -1
  249. transformers/models/colpali/modeling_colpali.py +14 -16
  250. transformers/models/colpali/modular_colpali.py +11 -51
  251. transformers/models/colpali/processing_colpali.py +14 -52
  252. transformers/models/colqwen2/modeling_colqwen2.py +27 -28
  253. transformers/models/colqwen2/modular_colqwen2.py +36 -74
  254. transformers/models/colqwen2/processing_colqwen2.py +16 -52
  255. transformers/models/conditional_detr/configuration_conditional_detr.py +19 -47
  256. transformers/models/conditional_detr/image_processing_conditional_detr.py +67 -70
  257. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +50 -36
  258. transformers/models/conditional_detr/modeling_conditional_detr.py +851 -1001
  259. transformers/models/conditional_detr/modular_conditional_detr.py +901 -5
  260. transformers/models/convbert/configuration_convbert.py +11 -8
  261. transformers/models/convbert/modeling_convbert.py +85 -87
  262. transformers/models/convbert/tokenization_convbert.py +0 -1
  263. transformers/models/convnext/configuration_convnext.py +2 -5
  264. transformers/models/convnext/image_processing_convnext.py +18 -21
  265. transformers/models/convnext/image_processing_convnext_fast.py +7 -8
  266. transformers/models/convnext/modeling_convnext.py +12 -14
  267. transformers/models/convnextv2/configuration_convnextv2.py +2 -5
  268. transformers/models/convnextv2/modeling_convnextv2.py +12 -14
  269. transformers/models/cpm/tokenization_cpm.py +6 -7
  270. transformers/models/cpm/tokenization_cpm_fast.py +3 -5
  271. transformers/models/cpmant/configuration_cpmant.py +4 -1
  272. transformers/models/cpmant/modeling_cpmant.py +38 -40
  273. transformers/models/cpmant/tokenization_cpmant.py +1 -3
  274. transformers/models/csm/configuration_csm.py +58 -66
  275. transformers/models/csm/generation_csm.py +13 -14
  276. transformers/models/csm/modeling_csm.py +81 -84
  277. transformers/models/csm/modular_csm.py +56 -58
  278. transformers/models/csm/processing_csm.py +25 -68
  279. transformers/models/ctrl/configuration_ctrl.py +16 -1
  280. transformers/models/ctrl/modeling_ctrl.py +51 -66
  281. transformers/models/ctrl/tokenization_ctrl.py +0 -1
  282. transformers/models/cvt/configuration_cvt.py +0 -1
  283. transformers/models/cvt/modeling_cvt.py +13 -15
  284. transformers/models/cwm/__init__.py +0 -1
  285. transformers/models/cwm/configuration_cwm.py +8 -12
  286. transformers/models/cwm/modeling_cwm.py +36 -38
  287. transformers/models/cwm/modular_cwm.py +10 -12
  288. transformers/models/d_fine/configuration_d_fine.py +10 -57
  289. transformers/models/d_fine/modeling_d_fine.py +786 -927
  290. transformers/models/d_fine/modular_d_fine.py +339 -417
  291. transformers/models/dab_detr/configuration_dab_detr.py +22 -49
  292. transformers/models/dab_detr/modeling_dab_detr.py +79 -77
  293. transformers/models/dac/configuration_dac.py +0 -1
  294. transformers/models/dac/feature_extraction_dac.py +6 -9
  295. transformers/models/dac/modeling_dac.py +22 -24
  296. transformers/models/data2vec/configuration_data2vec_audio.py +4 -2
  297. transformers/models/data2vec/configuration_data2vec_text.py +11 -3
  298. transformers/models/data2vec/configuration_data2vec_vision.py +0 -1
  299. transformers/models/data2vec/modeling_data2vec_audio.py +55 -59
  300. transformers/models/data2vec/modeling_data2vec_text.py +97 -99
  301. transformers/models/data2vec/modeling_data2vec_vision.py +45 -44
  302. transformers/models/data2vec/modular_data2vec_audio.py +6 -1
  303. transformers/models/data2vec/modular_data2vec_text.py +51 -54
  304. transformers/models/dbrx/configuration_dbrx.py +29 -22
  305. transformers/models/dbrx/modeling_dbrx.py +45 -48
  306. transformers/models/dbrx/modular_dbrx.py +37 -39
  307. transformers/models/deberta/configuration_deberta.py +6 -1
  308. transformers/models/deberta/modeling_deberta.py +57 -60
  309. transformers/models/deberta/tokenization_deberta.py +2 -5
  310. transformers/models/deberta_v2/configuration_deberta_v2.py +6 -1
  311. transformers/models/deberta_v2/modeling_deberta_v2.py +63 -65
  312. transformers/models/deberta_v2/tokenization_deberta_v2.py +1 -4
  313. transformers/models/decision_transformer/configuration_decision_transformer.py +3 -2
  314. transformers/models/decision_transformer/modeling_decision_transformer.py +51 -53
  315. transformers/models/deepseek_v2/configuration_deepseek_v2.py +41 -47
  316. transformers/models/deepseek_v2/modeling_deepseek_v2.py +39 -41
  317. transformers/models/deepseek_v2/modular_deepseek_v2.py +48 -52
  318. transformers/models/deepseek_v3/configuration_deepseek_v3.py +42 -48
  319. transformers/models/deepseek_v3/modeling_deepseek_v3.py +38 -40
  320. transformers/models/deepseek_v3/modular_deepseek_v3.py +10 -10
  321. transformers/models/deepseek_vl/configuration_deepseek_vl.py +6 -3
  322. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +27 -28
  323. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +12 -11
  324. transformers/models/deepseek_vl/modeling_deepseek_vl.py +48 -43
  325. transformers/models/deepseek_vl/modular_deepseek_vl.py +15 -43
  326. transformers/models/deepseek_vl/processing_deepseek_vl.py +10 -41
  327. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +7 -5
  328. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +37 -37
  329. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +22 -22
  330. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +100 -56
  331. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +141 -109
  332. transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +12 -44
  333. transformers/models/deformable_detr/configuration_deformable_detr.py +22 -46
  334. transformers/models/deformable_detr/image_processing_deformable_detr.py +59 -61
  335. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +42 -28
  336. transformers/models/deformable_detr/modeling_deformable_detr.py +454 -652
  337. transformers/models/deformable_detr/modular_deformable_detr.py +1385 -5
  338. transformers/models/deit/configuration_deit.py +0 -1
  339. transformers/models/deit/image_processing_deit.py +18 -21
  340. transformers/models/deit/image_processing_deit_fast.py +0 -1
  341. transformers/models/deit/modeling_deit.py +27 -25
  342. transformers/models/depth_anything/configuration_depth_anything.py +12 -43
  343. transformers/models/depth_anything/modeling_depth_anything.py +10 -11
  344. transformers/models/depth_pro/configuration_depth_pro.py +0 -1
  345. transformers/models/depth_pro/image_processing_depth_pro.py +22 -23
  346. transformers/models/depth_pro/image_processing_depth_pro_fast.py +8 -9
  347. transformers/models/depth_pro/modeling_depth_pro.py +29 -27
  348. transformers/models/detr/configuration_detr.py +18 -50
  349. transformers/models/detr/image_processing_detr.py +64 -66
  350. transformers/models/detr/image_processing_detr_fast.py +33 -34
  351. transformers/models/detr/modeling_detr.py +748 -789
  352. transformers/models/dia/configuration_dia.py +9 -15
  353. transformers/models/dia/feature_extraction_dia.py +6 -9
  354. transformers/models/dia/generation_dia.py +48 -53
  355. transformers/models/dia/modeling_dia.py +68 -71
  356. transformers/models/dia/modular_dia.py +56 -58
  357. transformers/models/dia/processing_dia.py +39 -29
  358. transformers/models/dia/tokenization_dia.py +3 -6
  359. transformers/models/diffllama/configuration_diffllama.py +25 -30
  360. transformers/models/diffllama/modeling_diffllama.py +45 -53
  361. transformers/models/diffllama/modular_diffllama.py +18 -25
  362. transformers/models/dinat/configuration_dinat.py +2 -5
  363. transformers/models/dinat/modeling_dinat.py +47 -48
  364. transformers/models/dinov2/configuration_dinov2.py +2 -5
  365. transformers/models/dinov2/modeling_dinov2.py +20 -21
  366. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +3 -5
  367. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +21 -21
  368. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +11 -14
  369. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +6 -11
  370. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +5 -9
  371. transformers/models/dinov3_vit/configuration_dinov3_vit.py +7 -12
  372. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +7 -8
  373. transformers/models/dinov3_vit/modeling_dinov3_vit.py +19 -22
  374. transformers/models/dinov3_vit/modular_dinov3_vit.py +16 -19
  375. transformers/models/distilbert/configuration_distilbert.py +8 -2
  376. transformers/models/distilbert/modeling_distilbert.py +47 -49
  377. transformers/models/distilbert/tokenization_distilbert.py +0 -1
  378. transformers/models/doge/__init__.py +0 -1
  379. transformers/models/doge/configuration_doge.py +42 -35
  380. transformers/models/doge/modeling_doge.py +46 -49
  381. transformers/models/doge/modular_doge.py +77 -68
  382. transformers/models/donut/configuration_donut_swin.py +0 -1
  383. transformers/models/donut/image_processing_donut.py +26 -29
  384. transformers/models/donut/image_processing_donut_fast.py +9 -14
  385. transformers/models/donut/modeling_donut_swin.py +44 -46
  386. transformers/models/donut/processing_donut.py +5 -26
  387. transformers/models/dots1/configuration_dots1.py +43 -36
  388. transformers/models/dots1/modeling_dots1.py +35 -38
  389. transformers/models/dots1/modular_dots1.py +0 -1
  390. transformers/models/dpr/configuration_dpr.py +19 -2
  391. transformers/models/dpr/modeling_dpr.py +37 -39
  392. transformers/models/dpr/tokenization_dpr.py +7 -9
  393. transformers/models/dpr/tokenization_dpr_fast.py +7 -9
  394. transformers/models/dpt/configuration_dpt.py +23 -66
  395. transformers/models/dpt/image_processing_dpt.py +65 -66
  396. transformers/models/dpt/image_processing_dpt_fast.py +18 -19
  397. transformers/models/dpt/modeling_dpt.py +38 -36
  398. transformers/models/dpt/modular_dpt.py +14 -15
  399. transformers/models/edgetam/configuration_edgetam.py +1 -2
  400. transformers/models/edgetam/modeling_edgetam.py +87 -89
  401. transformers/models/edgetam/modular_edgetam.py +7 -13
  402. transformers/models/edgetam_video/__init__.py +0 -1
  403. transformers/models/edgetam_video/configuration_edgetam_video.py +0 -1
  404. transformers/models/edgetam_video/modeling_edgetam_video.py +126 -128
  405. transformers/models/edgetam_video/modular_edgetam_video.py +25 -27
  406. transformers/models/efficientloftr/configuration_efficientloftr.py +4 -5
  407. transformers/models/efficientloftr/image_processing_efficientloftr.py +14 -16
  408. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +8 -7
  409. transformers/models/efficientloftr/modeling_efficientloftr.py +46 -38
  410. transformers/models/efficientloftr/modular_efficientloftr.py +1 -3
  411. transformers/models/efficientnet/configuration_efficientnet.py +0 -1
  412. transformers/models/efficientnet/image_processing_efficientnet.py +23 -26
  413. transformers/models/efficientnet/image_processing_efficientnet_fast.py +16 -17
  414. transformers/models/efficientnet/modeling_efficientnet.py +12 -14
  415. transformers/models/electra/configuration_electra.py +13 -3
  416. transformers/models/electra/modeling_electra.py +107 -109
  417. transformers/models/emu3/configuration_emu3.py +17 -17
  418. transformers/models/emu3/image_processing_emu3.py +44 -39
  419. transformers/models/emu3/modeling_emu3.py +143 -109
  420. transformers/models/emu3/modular_emu3.py +109 -73
  421. transformers/models/emu3/processing_emu3.py +18 -43
  422. transformers/models/encodec/configuration_encodec.py +2 -4
  423. transformers/models/encodec/feature_extraction_encodec.py +10 -13
  424. transformers/models/encodec/modeling_encodec.py +25 -29
  425. transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -2
  426. transformers/models/encoder_decoder/modeling_encoder_decoder.py +37 -43
  427. transformers/models/eomt/configuration_eomt.py +12 -14
  428. transformers/models/eomt/image_processing_eomt.py +53 -55
  429. transformers/models/eomt/image_processing_eomt_fast.py +18 -19
  430. transformers/models/eomt/modeling_eomt.py +19 -21
  431. transformers/models/eomt/modular_eomt.py +28 -30
  432. transformers/models/eomt_dinov3/__init__.py +28 -0
  433. transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
  434. transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
  435. transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
  436. transformers/models/ernie/configuration_ernie.py +24 -3
  437. transformers/models/ernie/modeling_ernie.py +127 -162
  438. transformers/models/ernie/modular_ernie.py +91 -103
  439. transformers/models/ernie4_5/configuration_ernie4_5.py +23 -27
  440. transformers/models/ernie4_5/modeling_ernie4_5.py +35 -37
  441. transformers/models/ernie4_5/modular_ernie4_5.py +1 -3
  442. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +34 -39
  443. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +40 -42
  444. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +7 -9
  445. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -7
  446. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +34 -35
  447. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +6 -7
  448. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +305 -267
  449. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +163 -142
  450. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +3 -5
  451. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +17 -18
  452. transformers/models/esm/configuration_esm.py +11 -15
  453. transformers/models/esm/modeling_esm.py +35 -37
  454. transformers/models/esm/modeling_esmfold.py +43 -50
  455. transformers/models/esm/openfold_utils/chunk_utils.py +6 -6
  456. transformers/models/esm/openfold_utils/loss.py +1 -2
  457. transformers/models/esm/openfold_utils/protein.py +15 -16
  458. transformers/models/esm/openfold_utils/tensor_utils.py +6 -6
  459. transformers/models/esm/tokenization_esm.py +2 -4
  460. transformers/models/evolla/configuration_evolla.py +50 -40
  461. transformers/models/evolla/modeling_evolla.py +69 -68
  462. transformers/models/evolla/modular_evolla.py +50 -48
  463. transformers/models/evolla/processing_evolla.py +23 -35
  464. transformers/models/exaone4/configuration_exaone4.py +27 -27
  465. transformers/models/exaone4/modeling_exaone4.py +36 -39
  466. transformers/models/exaone4/modular_exaone4.py +51 -50
  467. transformers/models/exaone_moe/__init__.py +27 -0
  468. transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
  469. transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
  470. transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
  471. transformers/models/falcon/configuration_falcon.py +31 -26
  472. transformers/models/falcon/modeling_falcon.py +76 -84
  473. transformers/models/falcon_h1/configuration_falcon_h1.py +57 -51
  474. transformers/models/falcon_h1/modeling_falcon_h1.py +74 -109
  475. transformers/models/falcon_h1/modular_falcon_h1.py +68 -100
  476. transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -2
  477. transformers/models/falcon_mamba/modeling_falcon_mamba.py +64 -73
  478. transformers/models/falcon_mamba/modular_falcon_mamba.py +14 -13
  479. transformers/models/fast_vlm/configuration_fast_vlm.py +10 -0
  480. transformers/models/fast_vlm/modeling_fast_vlm.py +70 -97
  481. transformers/models/fast_vlm/modular_fast_vlm.py +148 -38
  482. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +2 -6
  483. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +45 -47
  484. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -3
  485. transformers/models/flaubert/configuration_flaubert.py +10 -5
  486. transformers/models/flaubert/modeling_flaubert.py +125 -129
  487. transformers/models/flaubert/tokenization_flaubert.py +3 -5
  488. transformers/models/flava/configuration_flava.py +9 -9
  489. transformers/models/flava/image_processing_flava.py +66 -67
  490. transformers/models/flava/image_processing_flava_fast.py +46 -47
  491. transformers/models/flava/modeling_flava.py +144 -135
  492. transformers/models/flava/processing_flava.py +2 -12
  493. transformers/models/flex_olmo/__init__.py +0 -1
  494. transformers/models/flex_olmo/configuration_flex_olmo.py +34 -39
  495. transformers/models/flex_olmo/modeling_flex_olmo.py +41 -43
  496. transformers/models/flex_olmo/modular_flex_olmo.py +46 -51
  497. transformers/models/florence2/configuration_florence2.py +4 -1
  498. transformers/models/florence2/modeling_florence2.py +96 -72
  499. transformers/models/florence2/modular_florence2.py +100 -107
  500. transformers/models/florence2/processing_florence2.py +18 -47
  501. transformers/models/fnet/configuration_fnet.py +6 -2
  502. transformers/models/fnet/modeling_fnet.py +69 -80
  503. transformers/models/fnet/tokenization_fnet.py +0 -1
  504. transformers/models/focalnet/configuration_focalnet.py +2 -5
  505. transformers/models/focalnet/modeling_focalnet.py +49 -48
  506. transformers/models/fsmt/configuration_fsmt.py +12 -17
  507. transformers/models/fsmt/modeling_fsmt.py +47 -48
  508. transformers/models/fsmt/tokenization_fsmt.py +3 -5
  509. transformers/models/funnel/configuration_funnel.py +8 -1
  510. transformers/models/funnel/modeling_funnel.py +91 -93
  511. transformers/models/funnel/tokenization_funnel.py +2 -5
  512. transformers/models/fuyu/configuration_fuyu.py +28 -34
  513. transformers/models/fuyu/image_processing_fuyu.py +29 -31
  514. transformers/models/fuyu/image_processing_fuyu_fast.py +17 -17
  515. transformers/models/fuyu/modeling_fuyu.py +50 -52
  516. transformers/models/fuyu/processing_fuyu.py +9 -36
  517. transformers/models/gemma/configuration_gemma.py +25 -30
  518. transformers/models/gemma/modeling_gemma.py +36 -38
  519. transformers/models/gemma/modular_gemma.py +33 -36
  520. transformers/models/gemma/tokenization_gemma.py +3 -6
  521. transformers/models/gemma2/configuration_gemma2.py +30 -35
  522. transformers/models/gemma2/modeling_gemma2.py +38 -41
  523. transformers/models/gemma2/modular_gemma2.py +63 -67
  524. transformers/models/gemma3/configuration_gemma3.py +53 -48
  525. transformers/models/gemma3/image_processing_gemma3.py +29 -31
  526. transformers/models/gemma3/image_processing_gemma3_fast.py +11 -12
  527. transformers/models/gemma3/modeling_gemma3.py +123 -122
  528. transformers/models/gemma3/modular_gemma3.py +128 -125
  529. transformers/models/gemma3/processing_gemma3.py +5 -5
  530. transformers/models/gemma3n/configuration_gemma3n.py +42 -30
  531. transformers/models/gemma3n/feature_extraction_gemma3n.py +9 -11
  532. transformers/models/gemma3n/modeling_gemma3n.py +166 -147
  533. transformers/models/gemma3n/modular_gemma3n.py +176 -148
  534. transformers/models/gemma3n/processing_gemma3n.py +12 -26
  535. transformers/models/git/configuration_git.py +5 -8
  536. transformers/models/git/modeling_git.py +115 -127
  537. transformers/models/git/processing_git.py +2 -14
  538. transformers/models/glm/configuration_glm.py +26 -30
  539. transformers/models/glm/modeling_glm.py +36 -39
  540. transformers/models/glm/modular_glm.py +4 -7
  541. transformers/models/glm4/configuration_glm4.py +26 -30
  542. transformers/models/glm4/modeling_glm4.py +39 -41
  543. transformers/models/glm4/modular_glm4.py +8 -10
  544. transformers/models/glm46v/configuration_glm46v.py +4 -1
  545. transformers/models/glm46v/image_processing_glm46v.py +40 -38
  546. transformers/models/glm46v/image_processing_glm46v_fast.py +9 -9
  547. transformers/models/glm46v/modeling_glm46v.py +138 -93
  548. transformers/models/glm46v/modular_glm46v.py +5 -3
  549. transformers/models/glm46v/processing_glm46v.py +7 -41
  550. transformers/models/glm46v/video_processing_glm46v.py +9 -11
  551. transformers/models/glm4_moe/configuration_glm4_moe.py +42 -35
  552. transformers/models/glm4_moe/modeling_glm4_moe.py +36 -39
  553. transformers/models/glm4_moe/modular_glm4_moe.py +43 -36
  554. transformers/models/glm4_moe_lite/__init__.py +28 -0
  555. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +233 -0
  556. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +740 -0
  557. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +302 -0
  558. transformers/models/glm4v/configuration_glm4v.py +25 -24
  559. transformers/models/glm4v/image_processing_glm4v.py +39 -38
  560. transformers/models/glm4v/image_processing_glm4v_fast.py +8 -9
  561. transformers/models/glm4v/modeling_glm4v.py +249 -210
  562. transformers/models/glm4v/modular_glm4v.py +211 -230
  563. transformers/models/glm4v/processing_glm4v.py +7 -41
  564. transformers/models/glm4v/video_processing_glm4v.py +9 -11
  565. transformers/models/glm4v_moe/configuration_glm4v_moe.py +136 -127
  566. transformers/models/glm4v_moe/modeling_glm4v_moe.py +348 -356
  567. transformers/models/glm4v_moe/modular_glm4v_moe.py +76 -174
  568. transformers/models/glm_image/__init__.py +31 -0
  569. transformers/models/glm_image/configuration_glm_image.py +358 -0
  570. transformers/models/glm_image/image_processing_glm_image.py +503 -0
  571. transformers/models/glm_image/image_processing_glm_image_fast.py +294 -0
  572. transformers/models/glm_image/modeling_glm_image.py +1691 -0
  573. transformers/models/glm_image/modular_glm_image.py +1640 -0
  574. transformers/models/glm_image/processing_glm_image.py +265 -0
  575. transformers/models/glm_ocr/__init__.py +28 -0
  576. transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
  577. transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
  578. transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
  579. transformers/models/glmasr/__init__.py +0 -1
  580. transformers/models/glmasr/configuration_glmasr.py +0 -1
  581. transformers/models/glmasr/modeling_glmasr.py +51 -46
  582. transformers/models/glmasr/modular_glmasr.py +39 -29
  583. transformers/models/glmasr/processing_glmasr.py +7 -8
  584. transformers/models/glpn/configuration_glpn.py +0 -1
  585. transformers/models/glpn/image_processing_glpn.py +11 -12
  586. transformers/models/glpn/image_processing_glpn_fast.py +11 -12
  587. transformers/models/glpn/modeling_glpn.py +14 -14
  588. transformers/models/got_ocr2/configuration_got_ocr2.py +10 -13
  589. transformers/models/got_ocr2/image_processing_got_ocr2.py +22 -24
  590. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +9 -10
  591. transformers/models/got_ocr2/modeling_got_ocr2.py +69 -77
  592. transformers/models/got_ocr2/modular_got_ocr2.py +60 -52
  593. transformers/models/got_ocr2/processing_got_ocr2.py +42 -63
  594. transformers/models/gpt2/configuration_gpt2.py +13 -2
  595. transformers/models/gpt2/modeling_gpt2.py +111 -113
  596. transformers/models/gpt2/tokenization_gpt2.py +6 -9
  597. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -2
  598. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +78 -84
  599. transformers/models/gpt_neo/configuration_gpt_neo.py +9 -2
  600. transformers/models/gpt_neo/modeling_gpt_neo.py +66 -71
  601. transformers/models/gpt_neox/configuration_gpt_neox.py +27 -25
  602. transformers/models/gpt_neox/modeling_gpt_neox.py +74 -76
  603. transformers/models/gpt_neox/modular_gpt_neox.py +68 -70
  604. transformers/models/gpt_neox/tokenization_gpt_neox.py +2 -5
  605. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +24 -19
  606. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +43 -46
  607. transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +1 -3
  608. transformers/models/gpt_oss/configuration_gpt_oss.py +31 -30
  609. transformers/models/gpt_oss/modeling_gpt_oss.py +80 -114
  610. transformers/models/gpt_oss/modular_gpt_oss.py +62 -97
  611. transformers/models/gpt_sw3/tokenization_gpt_sw3.py +4 -4
  612. transformers/models/gptj/configuration_gptj.py +4 -5
  613. transformers/models/gptj/modeling_gptj.py +85 -88
  614. transformers/models/granite/configuration_granite.py +28 -33
  615. transformers/models/granite/modeling_granite.py +43 -45
  616. transformers/models/granite/modular_granite.py +29 -31
  617. transformers/models/granite_speech/configuration_granite_speech.py +0 -1
  618. transformers/models/granite_speech/feature_extraction_granite_speech.py +1 -3
  619. transformers/models/granite_speech/modeling_granite_speech.py +84 -60
  620. transformers/models/granite_speech/processing_granite_speech.py +11 -4
  621. transformers/models/granitemoe/configuration_granitemoe.py +31 -36
  622. transformers/models/granitemoe/modeling_granitemoe.py +39 -41
  623. transformers/models/granitemoe/modular_granitemoe.py +21 -23
  624. transformers/models/granitemoehybrid/__init__.py +0 -1
  625. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +55 -48
  626. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +82 -118
  627. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +57 -65
  628. transformers/models/granitemoeshared/configuration_granitemoeshared.py +33 -37
  629. transformers/models/granitemoeshared/modeling_granitemoeshared.py +52 -56
  630. transformers/models/granitemoeshared/modular_granitemoeshared.py +19 -21
  631. transformers/models/grounding_dino/configuration_grounding_dino.py +10 -46
  632. transformers/models/grounding_dino/image_processing_grounding_dino.py +60 -62
  633. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +28 -29
  634. transformers/models/grounding_dino/modeling_grounding_dino.py +161 -181
  635. transformers/models/grounding_dino/modular_grounding_dino.py +2 -3
  636. transformers/models/grounding_dino/processing_grounding_dino.py +10 -38
  637. transformers/models/groupvit/configuration_groupvit.py +4 -2
  638. transformers/models/groupvit/modeling_groupvit.py +98 -92
  639. transformers/models/helium/configuration_helium.py +25 -29
  640. transformers/models/helium/modeling_helium.py +37 -40
  641. transformers/models/helium/modular_helium.py +3 -7
  642. transformers/models/herbert/tokenization_herbert.py +4 -6
  643. transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -5
  644. transformers/models/hgnet_v2/modeling_hgnet_v2.py +12 -14
  645. transformers/models/hgnet_v2/modular_hgnet_v2.py +13 -17
  646. transformers/models/hiera/configuration_hiera.py +2 -5
  647. transformers/models/hiera/modeling_hiera.py +71 -70
  648. transformers/models/hubert/configuration_hubert.py +4 -2
  649. transformers/models/hubert/modeling_hubert.py +42 -41
  650. transformers/models/hubert/modular_hubert.py +8 -11
  651. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +26 -31
  652. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +58 -37
  653. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +31 -11
  654. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +31 -36
  655. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +54 -44
  656. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +27 -15
  657. transformers/models/ibert/configuration_ibert.py +4 -2
  658. transformers/models/ibert/modeling_ibert.py +60 -62
  659. transformers/models/ibert/quant_modules.py +0 -1
  660. transformers/models/idefics/configuration_idefics.py +5 -8
  661. transformers/models/idefics/image_processing_idefics.py +13 -15
  662. transformers/models/idefics/modeling_idefics.py +63 -65
  663. transformers/models/idefics/perceiver.py +1 -3
  664. transformers/models/idefics/processing_idefics.py +32 -48
  665. transformers/models/idefics/vision.py +27 -28
  666. transformers/models/idefics2/configuration_idefics2.py +1 -3
  667. transformers/models/idefics2/image_processing_idefics2.py +31 -32
  668. transformers/models/idefics2/image_processing_idefics2_fast.py +8 -8
  669. transformers/models/idefics2/modeling_idefics2.py +126 -106
  670. transformers/models/idefics2/processing_idefics2.py +10 -68
  671. transformers/models/idefics3/configuration_idefics3.py +1 -4
  672. transformers/models/idefics3/image_processing_idefics3.py +42 -43
  673. transformers/models/idefics3/image_processing_idefics3_fast.py +40 -15
  674. transformers/models/idefics3/modeling_idefics3.py +113 -92
  675. transformers/models/idefics3/processing_idefics3.py +15 -69
  676. transformers/models/ijepa/configuration_ijepa.py +0 -1
  677. transformers/models/ijepa/modeling_ijepa.py +13 -14
  678. transformers/models/ijepa/modular_ijepa.py +5 -7
  679. transformers/models/imagegpt/configuration_imagegpt.py +9 -2
  680. transformers/models/imagegpt/image_processing_imagegpt.py +17 -18
  681. transformers/models/imagegpt/image_processing_imagegpt_fast.py +10 -11
  682. transformers/models/imagegpt/modeling_imagegpt.py +65 -62
  683. transformers/models/informer/configuration_informer.py +6 -9
  684. transformers/models/informer/modeling_informer.py +87 -89
  685. transformers/models/informer/modular_informer.py +13 -16
  686. transformers/models/instructblip/configuration_instructblip.py +2 -2
  687. transformers/models/instructblip/modeling_instructblip.py +104 -79
  688. transformers/models/instructblip/processing_instructblip.py +10 -36
  689. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -2
  690. transformers/models/instructblipvideo/modeling_instructblipvideo.py +108 -105
  691. transformers/models/instructblipvideo/modular_instructblipvideo.py +73 -64
  692. transformers/models/instructblipvideo/processing_instructblipvideo.py +14 -33
  693. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +6 -7
  694. transformers/models/internvl/configuration_internvl.py +5 -1
  695. transformers/models/internvl/modeling_internvl.py +76 -98
  696. transformers/models/internvl/modular_internvl.py +45 -59
  697. transformers/models/internvl/processing_internvl.py +12 -45
  698. transformers/models/internvl/video_processing_internvl.py +10 -11
  699. transformers/models/jais2/configuration_jais2.py +25 -29
  700. transformers/models/jais2/modeling_jais2.py +36 -38
  701. transformers/models/jais2/modular_jais2.py +20 -22
  702. transformers/models/jamba/configuration_jamba.py +5 -8
  703. transformers/models/jamba/modeling_jamba.py +47 -50
  704. transformers/models/jamba/modular_jamba.py +40 -41
  705. transformers/models/janus/configuration_janus.py +0 -1
  706. transformers/models/janus/image_processing_janus.py +37 -39
  707. transformers/models/janus/image_processing_janus_fast.py +20 -21
  708. transformers/models/janus/modeling_janus.py +103 -188
  709. transformers/models/janus/modular_janus.py +122 -83
  710. transformers/models/janus/processing_janus.py +17 -43
  711. transformers/models/jetmoe/configuration_jetmoe.py +26 -27
  712. transformers/models/jetmoe/modeling_jetmoe.py +42 -45
  713. transformers/models/jetmoe/modular_jetmoe.py +33 -36
  714. transformers/models/kosmos2/configuration_kosmos2.py +10 -9
  715. transformers/models/kosmos2/modeling_kosmos2.py +199 -178
  716. transformers/models/kosmos2/processing_kosmos2.py +40 -55
  717. transformers/models/kosmos2_5/__init__.py +0 -1
  718. transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -9
  719. transformers/models/kosmos2_5/image_processing_kosmos2_5.py +10 -12
  720. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -11
  721. transformers/models/kosmos2_5/modeling_kosmos2_5.py +162 -172
  722. transformers/models/kosmos2_5/processing_kosmos2_5.py +8 -29
  723. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +31 -28
  724. transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +12 -14
  725. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +103 -106
  726. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +20 -22
  727. transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py +2 -8
  728. transformers/models/lasr/configuration_lasr.py +3 -7
  729. transformers/models/lasr/feature_extraction_lasr.py +10 -12
  730. transformers/models/lasr/modeling_lasr.py +21 -24
  731. transformers/models/lasr/modular_lasr.py +11 -13
  732. transformers/models/lasr/processing_lasr.py +12 -6
  733. transformers/models/lasr/tokenization_lasr.py +2 -4
  734. transformers/models/layoutlm/configuration_layoutlm.py +14 -2
  735. transformers/models/layoutlm/modeling_layoutlm.py +70 -72
  736. transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -17
  737. transformers/models/layoutlmv2/image_processing_layoutlmv2.py +18 -21
  738. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +7 -8
  739. transformers/models/layoutlmv2/modeling_layoutlmv2.py +48 -50
  740. transformers/models/layoutlmv2/processing_layoutlmv2.py +14 -44
  741. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +63 -74
  742. transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -19
  743. transformers/models/layoutlmv3/image_processing_layoutlmv3.py +24 -26
  744. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +9 -10
  745. transformers/models/layoutlmv3/modeling_layoutlmv3.py +49 -51
  746. transformers/models/layoutlmv3/processing_layoutlmv3.py +14 -46
  747. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +64 -75
  748. transformers/models/layoutxlm/configuration_layoutxlm.py +14 -17
  749. transformers/models/layoutxlm/modular_layoutxlm.py +0 -1
  750. transformers/models/layoutxlm/processing_layoutxlm.py +14 -44
  751. transformers/models/layoutxlm/tokenization_layoutxlm.py +65 -76
  752. transformers/models/led/configuration_led.py +8 -12
  753. transformers/models/led/modeling_led.py +113 -267
  754. transformers/models/levit/configuration_levit.py +0 -1
  755. transformers/models/levit/image_processing_levit.py +19 -21
  756. transformers/models/levit/image_processing_levit_fast.py +4 -5
  757. transformers/models/levit/modeling_levit.py +17 -19
  758. transformers/models/lfm2/configuration_lfm2.py +27 -30
  759. transformers/models/lfm2/modeling_lfm2.py +46 -48
  760. transformers/models/lfm2/modular_lfm2.py +32 -32
  761. transformers/models/lfm2_moe/__init__.py +0 -1
  762. transformers/models/lfm2_moe/configuration_lfm2_moe.py +6 -9
  763. transformers/models/lfm2_moe/modeling_lfm2_moe.py +48 -49
  764. transformers/models/lfm2_moe/modular_lfm2_moe.py +8 -9
  765. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -1
  766. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +43 -20
  767. transformers/models/lfm2_vl/modeling_lfm2_vl.py +73 -61
  768. transformers/models/lfm2_vl/modular_lfm2_vl.py +66 -54
  769. transformers/models/lfm2_vl/processing_lfm2_vl.py +14 -34
  770. transformers/models/lightglue/image_processing_lightglue.py +16 -15
  771. transformers/models/lightglue/image_processing_lightglue_fast.py +8 -7
  772. transformers/models/lightglue/modeling_lightglue.py +31 -33
  773. transformers/models/lightglue/modular_lightglue.py +31 -31
  774. transformers/models/lighton_ocr/__init__.py +28 -0
  775. transformers/models/lighton_ocr/configuration_lighton_ocr.py +128 -0
  776. transformers/models/lighton_ocr/modeling_lighton_ocr.py +463 -0
  777. transformers/models/lighton_ocr/modular_lighton_ocr.py +404 -0
  778. transformers/models/lighton_ocr/processing_lighton_ocr.py +229 -0
  779. transformers/models/lilt/configuration_lilt.py +6 -2
  780. transformers/models/lilt/modeling_lilt.py +53 -55
  781. transformers/models/llama/configuration_llama.py +26 -31
  782. transformers/models/llama/modeling_llama.py +35 -38
  783. transformers/models/llama/tokenization_llama.py +2 -4
  784. transformers/models/llama4/configuration_llama4.py +87 -69
  785. transformers/models/llama4/image_processing_llama4_fast.py +11 -12
  786. transformers/models/llama4/modeling_llama4.py +116 -115
  787. transformers/models/llama4/processing_llama4.py +33 -57
  788. transformers/models/llava/configuration_llava.py +10 -1
  789. transformers/models/llava/image_processing_llava.py +25 -28
  790. transformers/models/llava/image_processing_llava_fast.py +9 -10
  791. transformers/models/llava/modeling_llava.py +73 -102
  792. transformers/models/llava/processing_llava.py +18 -51
  793. transformers/models/llava_next/configuration_llava_next.py +2 -2
  794. transformers/models/llava_next/image_processing_llava_next.py +43 -45
  795. transformers/models/llava_next/image_processing_llava_next_fast.py +11 -12
  796. transformers/models/llava_next/modeling_llava_next.py +103 -104
  797. transformers/models/llava_next/processing_llava_next.py +18 -47
  798. transformers/models/llava_next_video/configuration_llava_next_video.py +10 -7
  799. transformers/models/llava_next_video/modeling_llava_next_video.py +168 -155
  800. transformers/models/llava_next_video/modular_llava_next_video.py +154 -147
  801. transformers/models/llava_next_video/processing_llava_next_video.py +21 -63
  802. transformers/models/llava_next_video/video_processing_llava_next_video.py +0 -1
  803. transformers/models/llava_onevision/configuration_llava_onevision.py +10 -7
  804. transformers/models/llava_onevision/image_processing_llava_onevision.py +40 -42
  805. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +14 -14
  806. transformers/models/llava_onevision/modeling_llava_onevision.py +170 -166
  807. transformers/models/llava_onevision/modular_llava_onevision.py +156 -152
  808. transformers/models/llava_onevision/processing_llava_onevision.py +21 -53
  809. transformers/models/llava_onevision/video_processing_llava_onevision.py +0 -1
  810. transformers/models/longcat_flash/__init__.py +0 -1
  811. transformers/models/longcat_flash/configuration_longcat_flash.py +39 -45
  812. transformers/models/longcat_flash/modeling_longcat_flash.py +37 -38
  813. transformers/models/longcat_flash/modular_longcat_flash.py +23 -24
  814. transformers/models/longformer/configuration_longformer.py +5 -5
  815. transformers/models/longformer/modeling_longformer.py +99 -101
  816. transformers/models/longt5/configuration_longt5.py +9 -7
  817. transformers/models/longt5/modeling_longt5.py +45 -45
  818. transformers/models/luke/configuration_luke.py +8 -2
  819. transformers/models/luke/modeling_luke.py +179 -181
  820. transformers/models/luke/tokenization_luke.py +99 -105
  821. transformers/{pipelines/deprecated → models/lw_detr}/__init__.py +14 -3
  822. transformers/models/lw_detr/configuration_lw_detr.py +362 -0
  823. transformers/models/lw_detr/modeling_lw_detr.py +1697 -0
  824. transformers/models/lw_detr/modular_lw_detr.py +1609 -0
  825. transformers/models/lxmert/configuration_lxmert.py +16 -1
  826. transformers/models/lxmert/modeling_lxmert.py +63 -74
  827. transformers/models/m2m_100/configuration_m2m_100.py +7 -9
  828. transformers/models/m2m_100/modeling_m2m_100.py +72 -74
  829. transformers/models/m2m_100/tokenization_m2m_100.py +8 -8
  830. transformers/models/mamba/configuration_mamba.py +5 -3
  831. transformers/models/mamba/modeling_mamba.py +61 -70
  832. transformers/models/mamba2/configuration_mamba2.py +5 -8
  833. transformers/models/mamba2/modeling_mamba2.py +66 -79
  834. transformers/models/marian/configuration_marian.py +10 -5
  835. transformers/models/marian/modeling_marian.py +88 -90
  836. transformers/models/marian/tokenization_marian.py +6 -6
  837. transformers/models/markuplm/configuration_markuplm.py +4 -7
  838. transformers/models/markuplm/feature_extraction_markuplm.py +1 -2
  839. transformers/models/markuplm/modeling_markuplm.py +63 -65
  840. transformers/models/markuplm/processing_markuplm.py +31 -38
  841. transformers/models/markuplm/tokenization_markuplm.py +67 -77
  842. transformers/models/mask2former/configuration_mask2former.py +14 -52
  843. transformers/models/mask2former/image_processing_mask2former.py +84 -85
  844. transformers/models/mask2former/image_processing_mask2former_fast.py +36 -36
  845. transformers/models/mask2former/modeling_mask2former.py +108 -104
  846. transformers/models/mask2former/modular_mask2former.py +6 -8
  847. transformers/models/maskformer/configuration_maskformer.py +17 -51
  848. transformers/models/maskformer/configuration_maskformer_swin.py +2 -5
  849. transformers/models/maskformer/image_processing_maskformer.py +84 -85
  850. transformers/models/maskformer/image_processing_maskformer_fast.py +35 -36
  851. transformers/models/maskformer/modeling_maskformer.py +71 -67
  852. transformers/models/maskformer/modeling_maskformer_swin.py +20 -23
  853. transformers/models/mbart/configuration_mbart.py +9 -5
  854. transformers/models/mbart/modeling_mbart.py +120 -119
  855. transformers/models/mbart/tokenization_mbart.py +2 -4
  856. transformers/models/mbart50/tokenization_mbart50.py +3 -5
  857. transformers/models/megatron_bert/configuration_megatron_bert.py +13 -3
  858. transformers/models/megatron_bert/modeling_megatron_bert.py +139 -165
  859. transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
  860. transformers/models/metaclip_2/modeling_metaclip_2.py +94 -87
  861. transformers/models/metaclip_2/modular_metaclip_2.py +59 -45
  862. transformers/models/mgp_str/configuration_mgp_str.py +0 -1
  863. transformers/models/mgp_str/modeling_mgp_str.py +18 -18
  864. transformers/models/mgp_str/processing_mgp_str.py +3 -20
  865. transformers/models/mgp_str/tokenization_mgp_str.py +1 -3
  866. transformers/models/mimi/configuration_mimi.py +42 -40
  867. transformers/models/mimi/modeling_mimi.py +116 -115
  868. transformers/models/minimax/__init__.py +0 -1
  869. transformers/models/minimax/configuration_minimax.py +40 -47
  870. transformers/models/minimax/modeling_minimax.py +46 -49
  871. transformers/models/minimax/modular_minimax.py +59 -65
  872. transformers/models/minimax_m2/__init__.py +28 -0
  873. transformers/models/minimax_m2/configuration_minimax_m2.py +188 -0
  874. transformers/models/minimax_m2/modeling_minimax_m2.py +704 -0
  875. transformers/models/minimax_m2/modular_minimax_m2.py +346 -0
  876. transformers/models/ministral/configuration_ministral.py +25 -29
  877. transformers/models/ministral/modeling_ministral.py +35 -37
  878. transformers/models/ministral/modular_ministral.py +32 -37
  879. transformers/models/ministral3/configuration_ministral3.py +23 -26
  880. transformers/models/ministral3/modeling_ministral3.py +35 -37
  881. transformers/models/ministral3/modular_ministral3.py +7 -8
  882. transformers/models/mistral/configuration_mistral.py +24 -29
  883. transformers/models/mistral/modeling_mistral.py +35 -37
  884. transformers/models/mistral/modular_mistral.py +14 -15
  885. transformers/models/mistral3/configuration_mistral3.py +4 -1
  886. transformers/models/mistral3/modeling_mistral3.py +79 -82
  887. transformers/models/mistral3/modular_mistral3.py +66 -67
  888. transformers/models/mixtral/configuration_mixtral.py +32 -38
  889. transformers/models/mixtral/modeling_mixtral.py +39 -42
  890. transformers/models/mixtral/modular_mixtral.py +26 -29
  891. transformers/models/mlcd/configuration_mlcd.py +0 -1
  892. transformers/models/mlcd/modeling_mlcd.py +17 -17
  893. transformers/models/mlcd/modular_mlcd.py +16 -16
  894. transformers/models/mllama/configuration_mllama.py +10 -15
  895. transformers/models/mllama/image_processing_mllama.py +23 -25
  896. transformers/models/mllama/image_processing_mllama_fast.py +11 -11
  897. transformers/models/mllama/modeling_mllama.py +100 -103
  898. transformers/models/mllama/processing_mllama.py +6 -55
  899. transformers/models/mluke/tokenization_mluke.py +97 -103
  900. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -46
  901. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +159 -179
  902. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -46
  903. transformers/models/mobilebert/configuration_mobilebert.py +4 -2
  904. transformers/models/mobilebert/modeling_mobilebert.py +78 -88
  905. transformers/models/mobilebert/tokenization_mobilebert.py +0 -1
  906. transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +0 -1
  907. transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +20 -23
  908. transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +0 -1
  909. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +13 -16
  910. transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +0 -1
  911. transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +48 -51
  912. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +14 -15
  913. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +21 -22
  914. transformers/models/mobilevit/configuration_mobilevit.py +0 -1
  915. transformers/models/mobilevit/image_processing_mobilevit.py +41 -44
  916. transformers/models/mobilevit/image_processing_mobilevit_fast.py +12 -13
  917. transformers/models/mobilevit/modeling_mobilevit.py +21 -21
  918. transformers/models/mobilevitv2/configuration_mobilevitv2.py +0 -1
  919. transformers/models/mobilevitv2/modeling_mobilevitv2.py +21 -22
  920. transformers/models/modernbert/configuration_modernbert.py +76 -51
  921. transformers/models/modernbert/modeling_modernbert.py +188 -943
  922. transformers/models/modernbert/modular_modernbert.py +255 -978
  923. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +50 -44
  924. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +54 -64
  925. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +92 -92
  926. transformers/models/moonshine/configuration_moonshine.py +34 -31
  927. transformers/models/moonshine/modeling_moonshine.py +70 -72
  928. transformers/models/moonshine/modular_moonshine.py +91 -86
  929. transformers/models/moshi/configuration_moshi.py +46 -23
  930. transformers/models/moshi/modeling_moshi.py +134 -142
  931. transformers/models/mpnet/configuration_mpnet.py +6 -2
  932. transformers/models/mpnet/modeling_mpnet.py +55 -57
  933. transformers/models/mpnet/tokenization_mpnet.py +1 -4
  934. transformers/models/mpt/configuration_mpt.py +17 -9
  935. transformers/models/mpt/modeling_mpt.py +58 -60
  936. transformers/models/mra/configuration_mra.py +8 -2
  937. transformers/models/mra/modeling_mra.py +54 -56
  938. transformers/models/mt5/configuration_mt5.py +9 -6
  939. transformers/models/mt5/modeling_mt5.py +80 -85
  940. transformers/models/musicgen/configuration_musicgen.py +12 -8
  941. transformers/models/musicgen/modeling_musicgen.py +114 -116
  942. transformers/models/musicgen/processing_musicgen.py +3 -21
  943. transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -8
  944. transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +8 -9
  945. transformers/models/musicgen_melody/modeling_musicgen_melody.py +113 -126
  946. transformers/models/musicgen_melody/processing_musicgen_melody.py +3 -22
  947. transformers/models/mvp/configuration_mvp.py +8 -5
  948. transformers/models/mvp/modeling_mvp.py +121 -123
  949. transformers/models/myt5/tokenization_myt5.py +8 -10
  950. transformers/models/nanochat/configuration_nanochat.py +5 -8
  951. transformers/models/nanochat/modeling_nanochat.py +36 -39
  952. transformers/models/nanochat/modular_nanochat.py +16 -18
  953. transformers/models/nemotron/configuration_nemotron.py +25 -30
  954. transformers/models/nemotron/modeling_nemotron.py +53 -66
  955. transformers/models/nllb/tokenization_nllb.py +14 -14
  956. transformers/models/nllb_moe/configuration_nllb_moe.py +7 -10
  957. transformers/models/nllb_moe/modeling_nllb_moe.py +70 -72
  958. transformers/models/nougat/image_processing_nougat.py +29 -32
  959. transformers/models/nougat/image_processing_nougat_fast.py +12 -13
  960. transformers/models/nougat/processing_nougat.py +37 -39
  961. transformers/models/nougat/tokenization_nougat.py +5 -7
  962. transformers/models/nystromformer/configuration_nystromformer.py +8 -2
  963. transformers/models/nystromformer/modeling_nystromformer.py +61 -63
  964. transformers/models/olmo/configuration_olmo.py +23 -28
  965. transformers/models/olmo/modeling_olmo.py +35 -38
  966. transformers/models/olmo/modular_olmo.py +8 -12
  967. transformers/models/olmo2/configuration_olmo2.py +27 -32
  968. transformers/models/olmo2/modeling_olmo2.py +36 -39
  969. transformers/models/olmo2/modular_olmo2.py +36 -38
  970. transformers/models/olmo3/__init__.py +0 -1
  971. transformers/models/olmo3/configuration_olmo3.py +30 -34
  972. transformers/models/olmo3/modeling_olmo3.py +35 -38
  973. transformers/models/olmo3/modular_olmo3.py +44 -47
  974. transformers/models/olmoe/configuration_olmoe.py +29 -33
  975. transformers/models/olmoe/modeling_olmoe.py +41 -43
  976. transformers/models/olmoe/modular_olmoe.py +15 -16
  977. transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -50
  978. transformers/models/omdet_turbo/modeling_omdet_turbo.py +59 -57
  979. transformers/models/omdet_turbo/processing_omdet_turbo.py +19 -67
  980. transformers/models/oneformer/configuration_oneformer.py +11 -51
  981. transformers/models/oneformer/image_processing_oneformer.py +83 -84
  982. transformers/models/oneformer/image_processing_oneformer_fast.py +41 -42
  983. transformers/models/oneformer/modeling_oneformer.py +137 -133
  984. transformers/models/oneformer/processing_oneformer.py +28 -43
  985. transformers/models/openai/configuration_openai.py +16 -1
  986. transformers/models/openai/modeling_openai.py +50 -51
  987. transformers/models/openai/tokenization_openai.py +2 -5
  988. transformers/models/opt/configuration_opt.py +6 -7
  989. transformers/models/opt/modeling_opt.py +79 -80
  990. transformers/models/ovis2/__init__.py +0 -1
  991. transformers/models/ovis2/configuration_ovis2.py +4 -1
  992. transformers/models/ovis2/image_processing_ovis2.py +22 -24
  993. transformers/models/ovis2/image_processing_ovis2_fast.py +9 -10
  994. transformers/models/ovis2/modeling_ovis2.py +99 -142
  995. transformers/models/ovis2/modular_ovis2.py +82 -45
  996. transformers/models/ovis2/processing_ovis2.py +12 -40
  997. transformers/models/owlv2/configuration_owlv2.py +4 -2
  998. transformers/models/owlv2/image_processing_owlv2.py +20 -21
  999. transformers/models/owlv2/image_processing_owlv2_fast.py +12 -13
  1000. transformers/models/owlv2/modeling_owlv2.py +122 -114
  1001. transformers/models/owlv2/modular_owlv2.py +11 -12
  1002. transformers/models/owlv2/processing_owlv2.py +20 -49
  1003. transformers/models/owlvit/configuration_owlvit.py +4 -2
  1004. transformers/models/owlvit/image_processing_owlvit.py +21 -22
  1005. transformers/models/owlvit/image_processing_owlvit_fast.py +2 -3
  1006. transformers/models/owlvit/modeling_owlvit.py +121 -113
  1007. transformers/models/owlvit/processing_owlvit.py +20 -48
  1008. transformers/models/paddleocr_vl/__init__.py +0 -1
  1009. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +28 -29
  1010. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +34 -35
  1011. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +12 -12
  1012. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +159 -158
  1013. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +148 -119
  1014. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +1 -3
  1015. transformers/models/paligemma/configuration_paligemma.py +4 -1
  1016. transformers/models/paligemma/modeling_paligemma.py +81 -79
  1017. transformers/models/paligemma/processing_paligemma.py +13 -66
  1018. transformers/models/parakeet/configuration_parakeet.py +3 -8
  1019. transformers/models/parakeet/feature_extraction_parakeet.py +10 -12
  1020. transformers/models/parakeet/modeling_parakeet.py +21 -25
  1021. transformers/models/parakeet/modular_parakeet.py +19 -21
  1022. transformers/models/parakeet/processing_parakeet.py +12 -5
  1023. transformers/models/parakeet/tokenization_parakeet.py +2 -4
  1024. transformers/models/patchtsmixer/configuration_patchtsmixer.py +5 -8
  1025. transformers/models/patchtsmixer/modeling_patchtsmixer.py +63 -65
  1026. transformers/models/patchtst/configuration_patchtst.py +6 -9
  1027. transformers/models/patchtst/modeling_patchtst.py +75 -77
  1028. transformers/models/pe_audio/__init__.py +0 -1
  1029. transformers/models/pe_audio/configuration_pe_audio.py +14 -16
  1030. transformers/models/pe_audio/feature_extraction_pe_audio.py +6 -8
  1031. transformers/models/pe_audio/modeling_pe_audio.py +30 -31
  1032. transformers/models/pe_audio/modular_pe_audio.py +17 -18
  1033. transformers/models/pe_audio/processing_pe_audio.py +0 -1
  1034. transformers/models/pe_audio_video/__init__.py +0 -1
  1035. transformers/models/pe_audio_video/configuration_pe_audio_video.py +15 -17
  1036. transformers/models/pe_audio_video/modeling_pe_audio_video.py +64 -65
  1037. transformers/models/pe_audio_video/modular_pe_audio_video.py +56 -57
  1038. transformers/models/pe_audio_video/processing_pe_audio_video.py +0 -1
  1039. transformers/models/pe_video/__init__.py +0 -1
  1040. transformers/models/pe_video/configuration_pe_video.py +14 -16
  1041. transformers/models/pe_video/modeling_pe_video.py +57 -46
  1042. transformers/models/pe_video/modular_pe_video.py +47 -35
  1043. transformers/models/pe_video/video_processing_pe_video.py +2 -4
  1044. transformers/models/pegasus/configuration_pegasus.py +8 -6
  1045. transformers/models/pegasus/modeling_pegasus.py +67 -69
  1046. transformers/models/pegasus/tokenization_pegasus.py +1 -4
  1047. transformers/models/pegasus_x/configuration_pegasus_x.py +5 -4
  1048. transformers/models/pegasus_x/modeling_pegasus_x.py +53 -55
  1049. transformers/models/perceiver/configuration_perceiver.py +0 -1
  1050. transformers/models/perceiver/image_processing_perceiver.py +22 -25
  1051. transformers/models/perceiver/image_processing_perceiver_fast.py +7 -8
  1052. transformers/models/perceiver/modeling_perceiver.py +152 -145
  1053. transformers/models/perceiver/tokenization_perceiver.py +3 -6
  1054. transformers/models/perception_lm/configuration_perception_lm.py +0 -1
  1055. transformers/models/perception_lm/image_processing_perception_lm_fast.py +8 -9
  1056. transformers/models/perception_lm/modeling_perception_lm.py +64 -67
  1057. transformers/models/perception_lm/modular_perception_lm.py +58 -58
  1058. transformers/models/perception_lm/processing_perception_lm.py +13 -47
  1059. transformers/models/perception_lm/video_processing_perception_lm.py +0 -1
  1060. transformers/models/persimmon/configuration_persimmon.py +23 -28
  1061. transformers/models/persimmon/modeling_persimmon.py +44 -47
  1062. transformers/models/phi/configuration_phi.py +27 -28
  1063. transformers/models/phi/modeling_phi.py +39 -41
  1064. transformers/models/phi/modular_phi.py +26 -26
  1065. transformers/models/phi3/configuration_phi3.py +32 -37
  1066. transformers/models/phi3/modeling_phi3.py +37 -40
  1067. transformers/models/phi3/modular_phi3.py +16 -20
  1068. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +36 -39
  1069. transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +7 -9
  1070. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +11 -11
  1071. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +100 -117
  1072. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +103 -90
  1073. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +7 -42
  1074. transformers/models/phimoe/configuration_phimoe.py +31 -36
  1075. transformers/models/phimoe/modeling_phimoe.py +50 -77
  1076. transformers/models/phimoe/modular_phimoe.py +12 -8
  1077. transformers/models/phobert/tokenization_phobert.py +4 -6
  1078. transformers/models/pix2struct/configuration_pix2struct.py +12 -10
  1079. transformers/models/pix2struct/image_processing_pix2struct.py +15 -19
  1080. transformers/models/pix2struct/image_processing_pix2struct_fast.py +12 -15
  1081. transformers/models/pix2struct/modeling_pix2struct.py +56 -52
  1082. transformers/models/pix2struct/processing_pix2struct.py +5 -26
  1083. transformers/models/pixio/__init__.py +0 -1
  1084. transformers/models/pixio/configuration_pixio.py +2 -5
  1085. transformers/models/pixio/modeling_pixio.py +16 -17
  1086. transformers/models/pixio/modular_pixio.py +7 -8
  1087. transformers/models/pixtral/configuration_pixtral.py +11 -14
  1088. transformers/models/pixtral/image_processing_pixtral.py +26 -28
  1089. transformers/models/pixtral/image_processing_pixtral_fast.py +10 -11
  1090. transformers/models/pixtral/modeling_pixtral.py +31 -37
  1091. transformers/models/pixtral/processing_pixtral.py +18 -52
  1092. transformers/models/plbart/configuration_plbart.py +8 -6
  1093. transformers/models/plbart/modeling_plbart.py +109 -109
  1094. transformers/models/plbart/modular_plbart.py +31 -33
  1095. transformers/models/plbart/tokenization_plbart.py +4 -5
  1096. transformers/models/poolformer/configuration_poolformer.py +0 -1
  1097. transformers/models/poolformer/image_processing_poolformer.py +21 -24
  1098. transformers/models/poolformer/image_processing_poolformer_fast.py +13 -14
  1099. transformers/models/poolformer/modeling_poolformer.py +10 -12
  1100. transformers/models/pop2piano/configuration_pop2piano.py +7 -7
  1101. transformers/models/pop2piano/feature_extraction_pop2piano.py +6 -9
  1102. transformers/models/pop2piano/modeling_pop2piano.py +24 -24
  1103. transformers/models/pop2piano/processing_pop2piano.py +25 -33
  1104. transformers/models/pop2piano/tokenization_pop2piano.py +15 -23
  1105. transformers/models/pp_doclayout_v3/__init__.py +30 -0
  1106. transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
  1107. transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
  1108. transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
  1109. transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
  1110. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +13 -46
  1111. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +28 -28
  1112. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +20 -21
  1113. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +17 -16
  1114. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +21 -20
  1115. transformers/models/prophetnet/configuration_prophetnet.py +37 -38
  1116. transformers/models/prophetnet/modeling_prophetnet.py +121 -153
  1117. transformers/models/prophetnet/tokenization_prophetnet.py +14 -16
  1118. transformers/models/pvt/configuration_pvt.py +0 -1
  1119. transformers/models/pvt/image_processing_pvt.py +24 -27
  1120. transformers/models/pvt/image_processing_pvt_fast.py +1 -2
  1121. transformers/models/pvt/modeling_pvt.py +19 -21
  1122. transformers/models/pvt_v2/configuration_pvt_v2.py +4 -8
  1123. transformers/models/pvt_v2/modeling_pvt_v2.py +27 -28
  1124. transformers/models/qwen2/configuration_qwen2.py +32 -25
  1125. transformers/models/qwen2/modeling_qwen2.py +35 -37
  1126. transformers/models/qwen2/modular_qwen2.py +14 -15
  1127. transformers/models/qwen2/tokenization_qwen2.py +2 -9
  1128. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +36 -27
  1129. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +241 -214
  1130. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +228 -193
  1131. transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +41 -49
  1132. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +28 -34
  1133. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +188 -145
  1134. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +64 -91
  1135. transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +7 -43
  1136. transformers/models/qwen2_audio/configuration_qwen2_audio.py +0 -1
  1137. transformers/models/qwen2_audio/modeling_qwen2_audio.py +39 -41
  1138. transformers/models/qwen2_audio/processing_qwen2_audio.py +13 -42
  1139. transformers/models/qwen2_moe/configuration_qwen2_moe.py +42 -35
  1140. transformers/models/qwen2_moe/modeling_qwen2_moe.py +40 -43
  1141. transformers/models/qwen2_moe/modular_qwen2_moe.py +10 -13
  1142. transformers/models/qwen2_vl/configuration_qwen2_vl.py +28 -33
  1143. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +38 -40
  1144. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +12 -15
  1145. transformers/models/qwen2_vl/modeling_qwen2_vl.py +184 -141
  1146. transformers/models/qwen2_vl/processing_qwen2_vl.py +7 -44
  1147. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +38 -18
  1148. transformers/models/qwen3/configuration_qwen3.py +34 -27
  1149. transformers/models/qwen3/modeling_qwen3.py +35 -38
  1150. transformers/models/qwen3/modular_qwen3.py +7 -9
  1151. transformers/models/qwen3_moe/configuration_qwen3_moe.py +45 -35
  1152. transformers/models/qwen3_moe/modeling_qwen3_moe.py +40 -43
  1153. transformers/models/qwen3_moe/modular_qwen3_moe.py +10 -13
  1154. transformers/models/qwen3_next/configuration_qwen3_next.py +47 -38
  1155. transformers/models/qwen3_next/modeling_qwen3_next.py +44 -47
  1156. transformers/models/qwen3_next/modular_qwen3_next.py +37 -38
  1157. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +139 -106
  1158. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +266 -206
  1159. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +228 -181
  1160. transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +40 -48
  1161. transformers/models/qwen3_vl/configuration_qwen3_vl.py +22 -24
  1162. transformers/models/qwen3_vl/modeling_qwen3_vl.py +185 -122
  1163. transformers/models/qwen3_vl/modular_qwen3_vl.py +153 -139
  1164. transformers/models/qwen3_vl/processing_qwen3_vl.py +6 -42
  1165. transformers/models/qwen3_vl/video_processing_qwen3_vl.py +10 -12
  1166. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +27 -30
  1167. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +249 -178
  1168. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +55 -42
  1169. transformers/models/rag/configuration_rag.py +6 -7
  1170. transformers/models/rag/modeling_rag.py +119 -121
  1171. transformers/models/rag/retrieval_rag.py +3 -5
  1172. transformers/models/rag/tokenization_rag.py +0 -50
  1173. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +29 -30
  1174. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +35 -39
  1175. transformers/models/reformer/configuration_reformer.py +7 -8
  1176. transformers/models/reformer/modeling_reformer.py +67 -68
  1177. transformers/models/reformer/tokenization_reformer.py +3 -6
  1178. transformers/models/regnet/configuration_regnet.py +0 -1
  1179. transformers/models/regnet/modeling_regnet.py +7 -9
  1180. transformers/models/rembert/configuration_rembert.py +8 -2
  1181. transformers/models/rembert/modeling_rembert.py +108 -132
  1182. transformers/models/rembert/tokenization_rembert.py +1 -4
  1183. transformers/models/resnet/configuration_resnet.py +2 -5
  1184. transformers/models/resnet/modeling_resnet.py +14 -15
  1185. transformers/models/roberta/configuration_roberta.py +11 -3
  1186. transformers/models/roberta/modeling_roberta.py +97 -99
  1187. transformers/models/roberta/modular_roberta.py +55 -58
  1188. transformers/models/roberta/tokenization_roberta.py +2 -5
  1189. transformers/models/roberta/tokenization_roberta_old.py +2 -4
  1190. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -3
  1191. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +97 -99
  1192. transformers/models/roc_bert/configuration_roc_bert.py +8 -2
  1193. transformers/models/roc_bert/modeling_roc_bert.py +125 -162
  1194. transformers/models/roc_bert/tokenization_roc_bert.py +88 -94
  1195. transformers/models/roformer/configuration_roformer.py +13 -3
  1196. transformers/models/roformer/modeling_roformer.py +79 -95
  1197. transformers/models/roformer/tokenization_roformer.py +3 -6
  1198. transformers/models/roformer/tokenization_utils.py +0 -1
  1199. transformers/models/rt_detr/configuration_rt_detr.py +8 -50
  1200. transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -5
  1201. transformers/models/rt_detr/image_processing_rt_detr.py +54 -55
  1202. transformers/models/rt_detr/image_processing_rt_detr_fast.py +39 -26
  1203. transformers/models/rt_detr/modeling_rt_detr.py +643 -804
  1204. transformers/models/rt_detr/modeling_rt_detr_resnet.py +4 -7
  1205. transformers/models/rt_detr/modular_rt_detr.py +1522 -20
  1206. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -58
  1207. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +384 -521
  1208. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +27 -70
  1209. transformers/models/rwkv/configuration_rwkv.py +2 -4
  1210. transformers/models/rwkv/modeling_rwkv.py +29 -54
  1211. transformers/models/sam/configuration_sam.py +2 -1
  1212. transformers/models/sam/image_processing_sam.py +59 -60
  1213. transformers/models/sam/image_processing_sam_fast.py +25 -26
  1214. transformers/models/sam/modeling_sam.py +46 -43
  1215. transformers/models/sam/processing_sam.py +39 -27
  1216. transformers/models/sam2/configuration_sam2.py +1 -2
  1217. transformers/models/sam2/image_processing_sam2_fast.py +14 -15
  1218. transformers/models/sam2/modeling_sam2.py +96 -94
  1219. transformers/models/sam2/modular_sam2.py +85 -94
  1220. transformers/models/sam2/processing_sam2.py +31 -47
  1221. transformers/models/sam2_video/configuration_sam2_video.py +0 -1
  1222. transformers/models/sam2_video/modeling_sam2_video.py +114 -116
  1223. transformers/models/sam2_video/modular_sam2_video.py +72 -89
  1224. transformers/models/sam2_video/processing_sam2_video.py +49 -66
  1225. transformers/models/sam2_video/video_processing_sam2_video.py +1 -4
  1226. transformers/models/sam3/configuration_sam3.py +0 -1
  1227. transformers/models/sam3/image_processing_sam3_fast.py +17 -20
  1228. transformers/models/sam3/modeling_sam3.py +94 -100
  1229. transformers/models/sam3/modular_sam3.py +3 -8
  1230. transformers/models/sam3/processing_sam3.py +37 -52
  1231. transformers/models/sam3_tracker/__init__.py +0 -1
  1232. transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -3
  1233. transformers/models/sam3_tracker/modeling_sam3_tracker.py +79 -80
  1234. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -2
  1235. transformers/models/sam3_tracker/processing_sam3_tracker.py +31 -48
  1236. transformers/models/sam3_tracker_video/__init__.py +0 -1
  1237. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +0 -1
  1238. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +115 -114
  1239. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -24
  1240. transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +50 -66
  1241. transformers/models/sam3_video/configuration_sam3_video.py +0 -1
  1242. transformers/models/sam3_video/modeling_sam3_video.py +56 -45
  1243. transformers/models/sam3_video/processing_sam3_video.py +25 -45
  1244. transformers/models/sam_hq/__init__.py +1 -1
  1245. transformers/models/sam_hq/configuration_sam_hq.py +2 -1
  1246. transformers/models/sam_hq/modeling_sam_hq.py +52 -50
  1247. transformers/models/sam_hq/modular_sam_hq.py +23 -25
  1248. transformers/models/sam_hq/{processing_samhq.py → processing_sam_hq.py} +41 -29
  1249. transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -10
  1250. transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py +8 -11
  1251. transformers/models/seamless_m4t/modeling_seamless_m4t.py +180 -182
  1252. transformers/models/seamless_m4t/processing_seamless_m4t.py +18 -39
  1253. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +15 -20
  1254. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -10
  1255. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +193 -195
  1256. transformers/models/seed_oss/configuration_seed_oss.py +30 -34
  1257. transformers/models/seed_oss/modeling_seed_oss.py +34 -36
  1258. transformers/models/seed_oss/modular_seed_oss.py +6 -7
  1259. transformers/models/segformer/configuration_segformer.py +0 -10
  1260. transformers/models/segformer/image_processing_segformer.py +39 -42
  1261. transformers/models/segformer/image_processing_segformer_fast.py +11 -12
  1262. transformers/models/segformer/modeling_segformer.py +28 -28
  1263. transformers/models/segformer/modular_segformer.py +8 -9
  1264. transformers/models/seggpt/configuration_seggpt.py +0 -1
  1265. transformers/models/seggpt/image_processing_seggpt.py +38 -41
  1266. transformers/models/seggpt/modeling_seggpt.py +48 -38
  1267. transformers/models/sew/configuration_sew.py +4 -2
  1268. transformers/models/sew/modeling_sew.py +42 -40
  1269. transformers/models/sew/modular_sew.py +12 -13
  1270. transformers/models/sew_d/configuration_sew_d.py +4 -2
  1271. transformers/models/sew_d/modeling_sew_d.py +32 -31
  1272. transformers/models/shieldgemma2/configuration_shieldgemma2.py +0 -1
  1273. transformers/models/shieldgemma2/modeling_shieldgemma2.py +19 -21
  1274. transformers/models/shieldgemma2/processing_shieldgemma2.py +3 -5
  1275. transformers/models/siglip/configuration_siglip.py +4 -2
  1276. transformers/models/siglip/image_processing_siglip.py +17 -20
  1277. transformers/models/siglip/image_processing_siglip_fast.py +0 -1
  1278. transformers/models/siglip/modeling_siglip.py +65 -110
  1279. transformers/models/siglip/processing_siglip.py +2 -14
  1280. transformers/models/siglip/tokenization_siglip.py +6 -7
  1281. transformers/models/siglip2/__init__.py +1 -0
  1282. transformers/models/siglip2/configuration_siglip2.py +4 -2
  1283. transformers/models/siglip2/image_processing_siglip2.py +15 -16
  1284. transformers/models/siglip2/image_processing_siglip2_fast.py +6 -7
  1285. transformers/models/siglip2/modeling_siglip2.py +89 -130
  1286. transformers/models/siglip2/modular_siglip2.py +95 -48
  1287. transformers/models/siglip2/processing_siglip2.py +2 -14
  1288. transformers/models/siglip2/tokenization_siglip2.py +95 -0
  1289. transformers/models/smollm3/configuration_smollm3.py +29 -32
  1290. transformers/models/smollm3/modeling_smollm3.py +35 -38
  1291. transformers/models/smollm3/modular_smollm3.py +36 -38
  1292. transformers/models/smolvlm/configuration_smolvlm.py +2 -4
  1293. transformers/models/smolvlm/image_processing_smolvlm.py +42 -43
  1294. transformers/models/smolvlm/image_processing_smolvlm_fast.py +41 -15
  1295. transformers/models/smolvlm/modeling_smolvlm.py +124 -96
  1296. transformers/models/smolvlm/modular_smolvlm.py +50 -39
  1297. transformers/models/smolvlm/processing_smolvlm.py +15 -76
  1298. transformers/models/smolvlm/video_processing_smolvlm.py +16 -17
  1299. transformers/models/solar_open/__init__.py +27 -0
  1300. transformers/models/solar_open/configuration_solar_open.py +184 -0
  1301. transformers/models/solar_open/modeling_solar_open.py +642 -0
  1302. transformers/models/solar_open/modular_solar_open.py +224 -0
  1303. transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +0 -1
  1304. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +26 -27
  1305. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -9
  1306. transformers/models/speech_to_text/feature_extraction_speech_to_text.py +10 -13
  1307. transformers/models/speech_to_text/modeling_speech_to_text.py +55 -57
  1308. transformers/models/speech_to_text/processing_speech_to_text.py +4 -30
  1309. transformers/models/speech_to_text/tokenization_speech_to_text.py +5 -6
  1310. transformers/models/speecht5/configuration_speecht5.py +7 -9
  1311. transformers/models/speecht5/feature_extraction_speecht5.py +16 -37
  1312. transformers/models/speecht5/modeling_speecht5.py +172 -174
  1313. transformers/models/speecht5/number_normalizer.py +0 -1
  1314. transformers/models/speecht5/processing_speecht5.py +3 -37
  1315. transformers/models/speecht5/tokenization_speecht5.py +4 -5
  1316. transformers/models/splinter/configuration_splinter.py +6 -7
  1317. transformers/models/splinter/modeling_splinter.py +62 -59
  1318. transformers/models/splinter/tokenization_splinter.py +2 -4
  1319. transformers/models/squeezebert/configuration_squeezebert.py +14 -2
  1320. transformers/models/squeezebert/modeling_squeezebert.py +60 -62
  1321. transformers/models/squeezebert/tokenization_squeezebert.py +0 -1
  1322. transformers/models/stablelm/configuration_stablelm.py +28 -29
  1323. transformers/models/stablelm/modeling_stablelm.py +44 -47
  1324. transformers/models/starcoder2/configuration_starcoder2.py +30 -27
  1325. transformers/models/starcoder2/modeling_starcoder2.py +38 -41
  1326. transformers/models/starcoder2/modular_starcoder2.py +17 -19
  1327. transformers/models/superglue/configuration_superglue.py +7 -3
  1328. transformers/models/superglue/image_processing_superglue.py +15 -15
  1329. transformers/models/superglue/image_processing_superglue_fast.py +8 -8
  1330. transformers/models/superglue/modeling_superglue.py +41 -37
  1331. transformers/models/superpoint/image_processing_superpoint.py +15 -15
  1332. transformers/models/superpoint/image_processing_superpoint_fast.py +7 -9
  1333. transformers/models/superpoint/modeling_superpoint.py +17 -16
  1334. transformers/models/swiftformer/configuration_swiftformer.py +0 -1
  1335. transformers/models/swiftformer/modeling_swiftformer.py +12 -14
  1336. transformers/models/swin/configuration_swin.py +2 -5
  1337. transformers/models/swin/modeling_swin.py +69 -78
  1338. transformers/models/swin2sr/configuration_swin2sr.py +0 -1
  1339. transformers/models/swin2sr/image_processing_swin2sr.py +10 -13
  1340. transformers/models/swin2sr/image_processing_swin2sr_fast.py +4 -7
  1341. transformers/models/swin2sr/modeling_swin2sr.py +30 -30
  1342. transformers/models/swinv2/configuration_swinv2.py +2 -5
  1343. transformers/models/swinv2/modeling_swinv2.py +65 -74
  1344. transformers/models/switch_transformers/configuration_switch_transformers.py +11 -7
  1345. transformers/models/switch_transformers/modeling_switch_transformers.py +35 -36
  1346. transformers/models/switch_transformers/modular_switch_transformers.py +32 -33
  1347. transformers/models/t5/configuration_t5.py +9 -9
  1348. transformers/models/t5/modeling_t5.py +80 -85
  1349. transformers/models/t5/tokenization_t5.py +1 -3
  1350. transformers/models/t5gemma/configuration_t5gemma.py +43 -59
  1351. transformers/models/t5gemma/modeling_t5gemma.py +105 -108
  1352. transformers/models/t5gemma/modular_t5gemma.py +128 -142
  1353. transformers/models/t5gemma2/configuration_t5gemma2.py +86 -100
  1354. transformers/models/t5gemma2/modeling_t5gemma2.py +234 -194
  1355. transformers/models/t5gemma2/modular_t5gemma2.py +279 -264
  1356. transformers/models/table_transformer/configuration_table_transformer.py +18 -50
  1357. transformers/models/table_transformer/modeling_table_transformer.py +73 -101
  1358. transformers/models/tapas/configuration_tapas.py +12 -2
  1359. transformers/models/tapas/modeling_tapas.py +65 -67
  1360. transformers/models/tapas/tokenization_tapas.py +116 -153
  1361. transformers/models/textnet/configuration_textnet.py +4 -7
  1362. transformers/models/textnet/image_processing_textnet.py +22 -25
  1363. transformers/models/textnet/image_processing_textnet_fast.py +8 -9
  1364. transformers/models/textnet/modeling_textnet.py +28 -28
  1365. transformers/models/time_series_transformer/configuration_time_series_transformer.py +5 -8
  1366. transformers/models/time_series_transformer/modeling_time_series_transformer.py +82 -84
  1367. transformers/models/timesfm/configuration_timesfm.py +0 -1
  1368. transformers/models/timesfm/modeling_timesfm.py +22 -25
  1369. transformers/models/timesfm/modular_timesfm.py +21 -24
  1370. transformers/models/timesformer/configuration_timesformer.py +0 -1
  1371. transformers/models/timesformer/modeling_timesformer.py +13 -16
  1372. transformers/models/timm_backbone/configuration_timm_backbone.py +33 -8
  1373. transformers/models/timm_backbone/modeling_timm_backbone.py +25 -30
  1374. transformers/models/timm_wrapper/configuration_timm_wrapper.py +2 -3
  1375. transformers/models/timm_wrapper/image_processing_timm_wrapper.py +4 -5
  1376. transformers/models/timm_wrapper/modeling_timm_wrapper.py +22 -19
  1377. transformers/models/trocr/configuration_trocr.py +11 -8
  1378. transformers/models/trocr/modeling_trocr.py +42 -42
  1379. transformers/models/trocr/processing_trocr.py +5 -25
  1380. transformers/models/tvp/configuration_tvp.py +10 -36
  1381. transformers/models/tvp/image_processing_tvp.py +50 -52
  1382. transformers/models/tvp/image_processing_tvp_fast.py +15 -15
  1383. transformers/models/tvp/modeling_tvp.py +26 -28
  1384. transformers/models/tvp/processing_tvp.py +2 -14
  1385. transformers/models/udop/configuration_udop.py +16 -8
  1386. transformers/models/udop/modeling_udop.py +73 -72
  1387. transformers/models/udop/processing_udop.py +7 -26
  1388. transformers/models/udop/tokenization_udop.py +80 -93
  1389. transformers/models/umt5/configuration_umt5.py +8 -7
  1390. transformers/models/umt5/modeling_umt5.py +87 -84
  1391. transformers/models/unispeech/configuration_unispeech.py +4 -2
  1392. transformers/models/unispeech/modeling_unispeech.py +54 -53
  1393. transformers/models/unispeech/modular_unispeech.py +20 -22
  1394. transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -2
  1395. transformers/models/unispeech_sat/modeling_unispeech_sat.py +70 -69
  1396. transformers/models/unispeech_sat/modular_unispeech_sat.py +21 -23
  1397. transformers/models/univnet/feature_extraction_univnet.py +14 -14
  1398. transformers/models/univnet/modeling_univnet.py +7 -8
  1399. transformers/models/upernet/configuration_upernet.py +8 -36
  1400. transformers/models/upernet/modeling_upernet.py +11 -14
  1401. transformers/models/vaultgemma/__init__.py +0 -1
  1402. transformers/models/vaultgemma/configuration_vaultgemma.py +29 -33
  1403. transformers/models/vaultgemma/modeling_vaultgemma.py +38 -40
  1404. transformers/models/vaultgemma/modular_vaultgemma.py +29 -31
  1405. transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
  1406. transformers/models/video_llama_3/image_processing_video_llama_3.py +40 -40
  1407. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +12 -14
  1408. transformers/models/video_llama_3/modeling_video_llama_3.py +149 -112
  1409. transformers/models/video_llama_3/modular_video_llama_3.py +152 -150
  1410. transformers/models/video_llama_3/processing_video_llama_3.py +5 -39
  1411. transformers/models/video_llama_3/video_processing_video_llama_3.py +45 -24
  1412. transformers/models/video_llava/configuration_video_llava.py +4 -1
  1413. transformers/models/video_llava/image_processing_video_llava.py +35 -38
  1414. transformers/models/video_llava/modeling_video_llava.py +139 -143
  1415. transformers/models/video_llava/processing_video_llava.py +38 -78
  1416. transformers/models/video_llava/video_processing_video_llava.py +0 -1
  1417. transformers/models/videomae/configuration_videomae.py +0 -1
  1418. transformers/models/videomae/image_processing_videomae.py +31 -34
  1419. transformers/models/videomae/modeling_videomae.py +17 -20
  1420. transformers/models/videomae/video_processing_videomae.py +0 -1
  1421. transformers/models/vilt/configuration_vilt.py +4 -2
  1422. transformers/models/vilt/image_processing_vilt.py +29 -30
  1423. transformers/models/vilt/image_processing_vilt_fast.py +15 -16
  1424. transformers/models/vilt/modeling_vilt.py +103 -90
  1425. transformers/models/vilt/processing_vilt.py +2 -14
  1426. transformers/models/vipllava/configuration_vipllava.py +4 -1
  1427. transformers/models/vipllava/modeling_vipllava.py +92 -67
  1428. transformers/models/vipllava/modular_vipllava.py +78 -54
  1429. transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +0 -1
  1430. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +28 -27
  1431. transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +0 -1
  1432. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +45 -41
  1433. transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +2 -16
  1434. transformers/models/visual_bert/configuration_visual_bert.py +6 -2
  1435. transformers/models/visual_bert/modeling_visual_bert.py +90 -92
  1436. transformers/models/vit/configuration_vit.py +2 -3
  1437. transformers/models/vit/image_processing_vit.py +19 -22
  1438. transformers/models/vit/image_processing_vit_fast.py +0 -1
  1439. transformers/models/vit/modeling_vit.py +20 -20
  1440. transformers/models/vit_mae/configuration_vit_mae.py +0 -1
  1441. transformers/models/vit_mae/modeling_vit_mae.py +32 -30
  1442. transformers/models/vit_msn/configuration_vit_msn.py +0 -1
  1443. transformers/models/vit_msn/modeling_vit_msn.py +21 -19
  1444. transformers/models/vitdet/configuration_vitdet.py +2 -5
  1445. transformers/models/vitdet/modeling_vitdet.py +14 -17
  1446. transformers/models/vitmatte/configuration_vitmatte.py +7 -39
  1447. transformers/models/vitmatte/image_processing_vitmatte.py +15 -18
  1448. transformers/models/vitmatte/image_processing_vitmatte_fast.py +16 -17
  1449. transformers/models/vitmatte/modeling_vitmatte.py +10 -12
  1450. transformers/models/vitpose/configuration_vitpose.py +7 -47
  1451. transformers/models/vitpose/image_processing_vitpose.py +24 -25
  1452. transformers/models/vitpose/image_processing_vitpose_fast.py +9 -10
  1453. transformers/models/vitpose/modeling_vitpose.py +15 -15
  1454. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -5
  1455. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +13 -16
  1456. transformers/models/vits/configuration_vits.py +4 -1
  1457. transformers/models/vits/modeling_vits.py +43 -42
  1458. transformers/models/vits/tokenization_vits.py +3 -4
  1459. transformers/models/vivit/configuration_vivit.py +0 -1
  1460. transformers/models/vivit/image_processing_vivit.py +36 -39
  1461. transformers/models/vivit/modeling_vivit.py +9 -11
  1462. transformers/models/vjepa2/__init__.py +0 -1
  1463. transformers/models/vjepa2/configuration_vjepa2.py +0 -1
  1464. transformers/models/vjepa2/modeling_vjepa2.py +39 -41
  1465. transformers/models/vjepa2/video_processing_vjepa2.py +0 -1
  1466. transformers/models/voxtral/__init__.py +0 -1
  1467. transformers/models/voxtral/configuration_voxtral.py +0 -2
  1468. transformers/models/voxtral/modeling_voxtral.py +41 -48
  1469. transformers/models/voxtral/modular_voxtral.py +35 -38
  1470. transformers/models/voxtral/processing_voxtral.py +25 -48
  1471. transformers/models/wav2vec2/configuration_wav2vec2.py +4 -2
  1472. transformers/models/wav2vec2/feature_extraction_wav2vec2.py +7 -10
  1473. transformers/models/wav2vec2/modeling_wav2vec2.py +74 -126
  1474. transformers/models/wav2vec2/processing_wav2vec2.py +6 -35
  1475. transformers/models/wav2vec2/tokenization_wav2vec2.py +20 -332
  1476. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -2
  1477. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +49 -52
  1478. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +45 -48
  1479. transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +6 -35
  1480. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -2
  1481. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +62 -65
  1482. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +15 -18
  1483. transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +16 -17
  1484. transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +36 -55
  1485. transformers/models/wavlm/configuration_wavlm.py +4 -2
  1486. transformers/models/wavlm/modeling_wavlm.py +49 -49
  1487. transformers/models/wavlm/modular_wavlm.py +4 -5
  1488. transformers/models/whisper/configuration_whisper.py +6 -5
  1489. transformers/models/whisper/english_normalizer.py +3 -4
  1490. transformers/models/whisper/feature_extraction_whisper.py +9 -24
  1491. transformers/models/whisper/generation_whisper.py +26 -49
  1492. transformers/models/whisper/modeling_whisper.py +71 -73
  1493. transformers/models/whisper/processing_whisper.py +3 -20
  1494. transformers/models/whisper/tokenization_whisper.py +9 -30
  1495. transformers/models/x_clip/configuration_x_clip.py +4 -2
  1496. transformers/models/x_clip/modeling_x_clip.py +94 -96
  1497. transformers/models/x_clip/processing_x_clip.py +2 -14
  1498. transformers/models/xcodec/configuration_xcodec.py +4 -6
  1499. transformers/models/xcodec/modeling_xcodec.py +15 -17
  1500. transformers/models/xglm/configuration_xglm.py +9 -8
  1501. transformers/models/xglm/modeling_xglm.py +49 -55
  1502. transformers/models/xglm/tokenization_xglm.py +1 -4
  1503. transformers/models/xlm/configuration_xlm.py +10 -8
  1504. transformers/models/xlm/modeling_xlm.py +127 -131
  1505. transformers/models/xlm/tokenization_xlm.py +3 -5
  1506. transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -3
  1507. transformers/models/xlm_roberta/modeling_xlm_roberta.py +96 -98
  1508. transformers/models/xlm_roberta/modular_xlm_roberta.py +50 -53
  1509. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +1 -4
  1510. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -2
  1511. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +97 -99
  1512. transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +67 -70
  1513. transformers/models/xlnet/configuration_xlnet.py +3 -12
  1514. transformers/models/xlnet/modeling_xlnet.py +149 -162
  1515. transformers/models/xlnet/tokenization_xlnet.py +1 -4
  1516. transformers/models/xlstm/configuration_xlstm.py +8 -12
  1517. transformers/models/xlstm/modeling_xlstm.py +61 -96
  1518. transformers/models/xmod/configuration_xmod.py +11 -3
  1519. transformers/models/xmod/modeling_xmod.py +111 -116
  1520. transformers/models/yolos/configuration_yolos.py +0 -1
  1521. transformers/models/yolos/image_processing_yolos.py +60 -62
  1522. transformers/models/yolos/image_processing_yolos_fast.py +42 -45
  1523. transformers/models/yolos/modeling_yolos.py +19 -21
  1524. transformers/models/yolos/modular_yolos.py +17 -19
  1525. transformers/models/yoso/configuration_yoso.py +8 -2
  1526. transformers/models/yoso/modeling_yoso.py +60 -62
  1527. transformers/models/youtu/__init__.py +27 -0
  1528. transformers/models/youtu/configuration_youtu.py +194 -0
  1529. transformers/models/youtu/modeling_youtu.py +619 -0
  1530. transformers/models/youtu/modular_youtu.py +254 -0
  1531. transformers/models/zamba/configuration_zamba.py +5 -8
  1532. transformers/models/zamba/modeling_zamba.py +93 -125
  1533. transformers/models/zamba2/configuration_zamba2.py +44 -50
  1534. transformers/models/zamba2/modeling_zamba2.py +137 -165
  1535. transformers/models/zamba2/modular_zamba2.py +79 -74
  1536. transformers/models/zoedepth/configuration_zoedepth.py +17 -41
  1537. transformers/models/zoedepth/image_processing_zoedepth.py +28 -29
  1538. transformers/models/zoedepth/image_processing_zoedepth_fast.py +20 -21
  1539. transformers/models/zoedepth/modeling_zoedepth.py +19 -19
  1540. transformers/pipelines/__init__.py +47 -106
  1541. transformers/pipelines/any_to_any.py +15 -23
  1542. transformers/pipelines/audio_utils.py +1 -2
  1543. transformers/pipelines/automatic_speech_recognition.py +0 -2
  1544. transformers/pipelines/base.py +13 -17
  1545. transformers/pipelines/image_text_to_text.py +1 -2
  1546. transformers/pipelines/question_answering.py +4 -43
  1547. transformers/pipelines/text_classification.py +1 -14
  1548. transformers/pipelines/text_to_audio.py +5 -1
  1549. transformers/pipelines/token_classification.py +1 -22
  1550. transformers/pipelines/video_classification.py +1 -9
  1551. transformers/pipelines/zero_shot_audio_classification.py +0 -1
  1552. transformers/pipelines/zero_shot_classification.py +0 -6
  1553. transformers/pipelines/zero_shot_image_classification.py +0 -7
  1554. transformers/processing_utils.py +128 -137
  1555. transformers/pytorch_utils.py +2 -26
  1556. transformers/quantizers/base.py +10 -0
  1557. transformers/quantizers/quantizer_compressed_tensors.py +7 -5
  1558. transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
  1559. transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
  1560. transformers/quantizers/quantizer_mxfp4.py +1 -1
  1561. transformers/quantizers/quantizer_quark.py +0 -1
  1562. transformers/quantizers/quantizer_torchao.py +3 -19
  1563. transformers/safetensors_conversion.py +11 -4
  1564. transformers/testing_utils.py +6 -65
  1565. transformers/tokenization_mistral_common.py +563 -903
  1566. transformers/tokenization_python.py +6 -4
  1567. transformers/tokenization_utils_base.py +228 -341
  1568. transformers/tokenization_utils_sentencepiece.py +5 -6
  1569. transformers/tokenization_utils_tokenizers.py +36 -7
  1570. transformers/trainer.py +30 -41
  1571. transformers/trainer_jit_checkpoint.py +1 -2
  1572. transformers/trainer_seq2seq.py +1 -1
  1573. transformers/training_args.py +414 -420
  1574. transformers/utils/__init__.py +1 -4
  1575. transformers/utils/attention_visualizer.py +1 -1
  1576. transformers/utils/auto_docstring.py +567 -18
  1577. transformers/utils/backbone_utils.py +13 -373
  1578. transformers/utils/doc.py +4 -36
  1579. transformers/utils/dummy_pt_objects.py +0 -42
  1580. transformers/utils/generic.py +70 -34
  1581. transformers/utils/import_utils.py +72 -75
  1582. transformers/utils/loading_report.py +135 -107
  1583. transformers/utils/quantization_config.py +8 -31
  1584. transformers/video_processing_utils.py +24 -25
  1585. transformers/video_utils.py +21 -23
  1586. {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/METADATA +120 -239
  1587. transformers-5.1.0.dist-info/RECORD +2092 -0
  1588. {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
  1589. transformers/pipelines/deprecated/text2text_generation.py +0 -408
  1590. transformers/pipelines/image_to_text.py +0 -229
  1591. transformers-5.0.0rc2.dist-info/RECORD +0 -2042
  1592. {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
  1593. {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
  1594. {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -14,39 +14,39 @@
  import os
  import re
  import shutil
- import warnings
- from collections.abc import Callable, Mapping, Sized
+ from collections.abc import Callable, Sequence
  from enum import Enum
  from pathlib import Path
- from typing import Any, Union, overload
+ from typing import Any, Literal, Union, overload

  import numpy as np
  from huggingface_hub import create_repo

  from transformers.audio_utils import load_audio_as
  from transformers.tokenization_utils_base import (
-     LARGE_INTEGER,
      VERY_LARGE_INTEGER,
+     AddedToken,
      BatchEncoding,
      EncodedInput,
      PreTokenizedInput,
+     PreTrainedTokenizerBase,
      TextInput,
      TruncationStrategy,
  )
  from transformers.utils import PaddingStrategy, TensorType, add_end_docstrings, logging, to_py_obj
- from transformers.utils.generic import is_torch_tensor
- from transformers.utils.hub import PushToHubMixin
  from transformers.utils.import_utils import is_mistral_common_available, is_torch_available, requires


  if is_mistral_common_available():
      from mistral_common.protocol.instruct.request import ChatCompletionRequest
      from mistral_common.protocol.instruct.validator import ValidationMode
-     from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy, TokenizerVersion
-     from mistral_common.tokens.tokenizers.image import MultiModalVersion
+     from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy, SpecialTokens
      from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
      from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-     from mistral_common.tokens.tokenizers.utils import download_tokenizer_from_hf_hub
+     from mistral_common.tokens.tokenizers.utils import (
+         download_tokenizer_from_hf_hub,
+         get_one_valid_tokenizer_file,
+     )


  if is_torch_available():
@@ -103,6 +103,10 @@ ENCODE_KWARGS_DOCSTRING = r"""
  """

  ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
+         return_token_type_ids (`bool`, *optional*):
+             Whether to return token type IDs. For `MistralCommonBackend` it returns a list of zeros of the sequence length as only one sequence is supported.
+
+             [What are token type IDs?](../glossary#token-type-ids)
          return_attention_mask (`bool`, *optional*):
              Whether to return the attention mask. If left to the default, will return the attention mask according
              to the specific tokenizer's default, defined by the `return_outputs` attribute.
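
The new `return_token_type_ids` entry above documents that this backend returns token type IDs as a list of zeros matching the sequence length, since only single sequences are supported. A minimal standalone sketch of what that documented behavior amounts to (the helper name below is made up for illustration and is not part of the library):

    # Hypothetical helper mirroring the documented behavior: with a single
    # sequence, every position gets segment id 0 and the length tracks input_ids.
    def token_type_ids_for_single_sequence(input_ids: list[int]) -> list[int]:
        return [0] * len(input_ids)

    assert token_type_ids_for_single_sequence([5, 17, 42]) == [0, 0, 0]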
@@ -118,6 +122,8 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
              Whether or not to return the lengths of the encoded inputs.
          verbose (`bool`, *optional*, defaults to `True`):
              Whether or not to print more information and warnings.
+         return_offsets_mapping (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
+         split_special_tokens (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
          **kwargs: passed to the `self.tokenize()` method

      Return:
@@ -149,8 +155,35 @@ class MistralTokenizerType(str, Enum):
      tekken = "tekken"


+ @overload
+ def _maybe_remove_lang(text: str, skip_special_tokens: bool) -> str: ...
+ @overload
+ def _maybe_remove_lang(text: list[str], skip_special_tokens: bool) -> list[str]: ...
+ def _maybe_remove_lang(text: str | list[str], skip_special_tokens: bool) -> str | list[str]:
+     # in the specific case of Voxtral, the added f"lang:xx" (always a two char language code since it follows ISO 639-1 alpha-2 format)
+     # is not considered as a special token by mistral-common and is encoded/ decoded as normal text.
+     # Nevertheless we should remove it to ease users life.
+     if not skip_special_tokens:
+         return text
+
+     if isinstance(text, str):
+         return re.sub(r"^lang:[a-z]{2}", "", text)
+
+     return [re.sub(r"^lang:[a-z]{2}", "", string) for string in text]
+
+
+ _MAP_SPECIAL_TOKENS = {
+     "bos_token": SpecialTokens.bos.value,
+     "eos_token": SpecialTokens.eos.value,
+     "pad_token": SpecialTokens.pad.value,
+     "unk_token": SpecialTokens.unk.value,
+ }
+
+ _VALID_INIT_KWARGS = {"_from_auto", "backend", "files_loaded"}
+
+
  @requires(backends=("mistral-common",))
- class MistralCommonBackend(PushToHubMixin):
+ class MistralCommonBackend(PreTrainedTokenizerBase):
      """
      Class to wrap `mistral-common` tokenizers.

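
The `_maybe_remove_lang` helper added in this hunk strips the "lang:xx" prefix that Voxtral prepends to transcriptions, but only when `skip_special_tokens=True`. A self-contained illustration of the regex it relies on (re-implemented here so the snippet runs on its own; only the pattern is taken from the hunk above):

    import re

    # Removes a leading ISO 639-1 "lang:xx" prefix, as the helper does when
    # skip_special_tokens=True; anything after the prefix, including whitespace,
    # is left untouched.
    def strip_lang_prefix(text: str) -> str:
        return re.sub(r"^lang:[a-z]{2}", "", text)

    assert strip_lang_prefix("lang:fr Bonjour") == " Bonjour"  # prefix removed, following space kept
    assert strip_lang_prefix("Bonjour") == "Bonjour"           # no prefix, unchanged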
156
189
 
@@ -165,34 +198,13 @@ class MistralCommonBackend(PushToHubMixin):
165
198
  For more info on `mistral-common`, see [mistral-common](https://github.com/mistralai/mistral-common).
166
199
 
167
200
  This class is a wrapper around a `mistral_common.tokens.tokenizers.mistral.MistralTokenizer`.
168
- It provides a Hugging Face compatible interface to tokenize using the official mistral-common tokenizer.
169
-
170
- Supports the following methods from the `PreTrainedTokenizerBase` class:
171
-
172
- - [`~MistralCommonBackend.get_vocab`]: Returns the vocabulary as a dictionary of token to index.
173
- This is a lossy conversion for Tekkenizer as some decoding errors are collapsed into the same token.
174
- - [`~MistralCommonBackend.encode`]: Encode a string to a list of integers.
175
- - [`~MistralCommonBackend.decode`]: Decode a list of integers to a string.
176
- - [`~MistralCommonBackend.batch_decode`]: Decode a batch of list of integers to a list of strings.
177
- - [`~MistralCommonBackend.convert_tokens_to_ids`]: Convert a list of tokens to a list of integers.
178
- - [`~MistralCommonBackend.convert_ids_to_tokens`]: Convert a list of integers to a list of tokens.
179
- - [`~MistralCommonBackend.tokenize`]: Tokenize a string.
180
- - [`~MistralCommonBackend.get_special_tokens_mask`]: Get the special tokens mask for a list of tokens.
181
- - [`~MistralCommonBackend.prepare_for_model`]: Prepare a list of inputs for the model.
182
- - [`~MistralCommonBackend.pad`]: Pad a list of inputs to the same length.
183
- - [`~MistralCommonBackend.truncate_sequences`]: Truncate a list of sequences to the same length.
184
- - [`~MistralCommonBackend.apply_chat_template`]: Apply a chat template to a list of messages.
185
- - [`~MistralCommonBackend.__call__`]: Tokenize a string or a list of strings.
186
- - [`~MistralCommonBackend.from_pretrained`]: Download and cache a pretrained tokenizer from the Hugging Face model hub or local directory.
187
- - [`~MistralCommonBackend.save_pretrained`]: Save a tokenizer to a directory, so it can be reloaded using the `from_pretrained` class method.
188
- - [`~MistralCommonBackend.push_to_hub`]: Upload tokenizer to the Hugging Face model hub.
189
-
190
- Here are the key differences with the `PreTrainedTokenizerBase` class:
191
-
192
- - Pair of sequences are not supported. The signature have been kept for compatibility but all arguments related to pair of sequences are ignored. The return values of pairs are returned as `None`.
201
+ It provides a Hugging Face-compatible interface for tokenizing with the official mistral-common tokenizer, and it inherits from the `PreTrainedTokenizerBase` class.
202
+
203
+ Here are the key behavior differences from the `PythonBackend` class:
204
+
205
+ - Pairs of sequences are not supported. The signatures have been kept for compatibility, but all arguments related to sequence pairs are ignored and the corresponding return values are `None`.
193
206
  - The `is_split_into_words` argument is not supported.
194
- - The `return_token_type_ids` argument is not supported.
195
- - It is not possible to add new tokens to the tokenizer. Also the special tokens are handled differently from Transformers. In `mistral-common`, special tokens are never encoded directly. This means that: `tokenizer.encode("<s>")` will not return the ID of the `<s>` token. Instead, it will return a list of IDs corresponding to the tokenization of the string `"<s>"`. For more information, see the [mistral-common documentation](https://mistralai.github.io/mistral-common/usage/tokenizers/#special-tokens).
207
+ - It is not possible to add new tokens to the tokenizer. Special tokens are also handled differently from Transformers: in `mistral-common`, special tokens are never encoded directly, so `tokenizer.encode("<s>")` will not return the ID of the `<s>` token but a list of IDs corresponding to the tokenization of the string `"<s>"` (see the usage sketch after this docstring). For more information, see the [mistral-common documentation](https://mistralai.github.io/mistral-common/usage/tokenizers/#special-tokens).
196
208
 
197
209
  If you have suggestions to improve this class, please open an issue on the [mistral-common GitHub repository](https://github.com/mistralai/mistral-common/issues) if it is related to the tokenizer or on the [Transformers GitHub repository](https://github.com/huggingface/transformers/issues) if it is related to the Hugging Face interface.
198
210
  """
@@ -200,6 +212,12 @@ class MistralCommonBackend(PushToHubMixin):
200
212
  model_input_names: list[str] = ["input_ids", "attention_mask"]
201
213
  padding_side: str = "left"
202
214
  truncation_side: str = "right"
215
+ SPECIAL_TOKENS_ATTRIBUTES = [
216
+ "bos_token",
217
+ "eos_token",
218
+ "unk_token",
219
+ "pad_token",
220
+ ]
203
221
 
204
222
  def __init__(
205
223
  self,
@@ -226,7 +244,7 @@ class MistralCommonBackend(PushToHubMixin):
226
244
  Path to the tokenizer file to load the `MistralTokenizer`.
227
245
  mode (`Union[str, ValidationMode]`, *optional*, defaults to `ValidationMode.test`):
228
246
  The mode to use for the tokenizer. This will be passed to the `MistralTokenizer` constructor. Possible values are:
229
- - `"finetuning"` or `ValidationMode.finetuning`: The finetuning mode.
247
+ - `"finetuning"` or `ValidationMode.finetuning`: The fine-tuning mode.
230
248
  - `"test"` or `ValidationMode.test`: The test mode.
231
249
  It changes how the tokenizer validates the input and prepares the request to the model.
232
250
  model_max_length (`int`, *optional*):
@@ -240,60 +258,49 @@ class MistralCommonBackend(PushToHubMixin):
240
258
  truncation_side (`str`, *optional*):
241
259
  The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
242
260
  Default value is picked from the class attribute of the same name.
243
- model_input_names (`List[string]`, *optional*):
261
+ model_input_names (`List[str]`, *optional*):
244
262
  The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
245
263
  `"attention_mask"`). Default value is picked from the class attribute of the same name.
246
264
  clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
247
- Whether or not the model should cleanup the spaces that were added when splitting the input text during the
265
+ Whether or not the model should clean up the spaces that were added when splitting the input text during the
248
266
  tokenization process.
249
267
  """
250
- if kwargs:
268
+ if kwargs and not set(kwargs.keys()).issubset(_VALID_INIT_KWARGS):
251
269
  raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported to init `MistralCommonBackend`.")
252
270
 
271
+ self.init_kwargs = {
272
+ "tokenizer_path": tokenizer_path,
273
+ "mode": mode,
274
+ "model_max_length": model_max_length,
275
+ "padding_side": padding_side,
276
+ "truncation_side": truncation_side,
277
+ "model_input_names": model_input_names,
278
+ "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
279
+ }
253
280
  self._tokenizer_path = Path(tokenizer_path)
254
281
  self._mode = self._get_validation_mode(mode)
282
+
255
283
  self.tokenizer: MistralTokenizer = MistralTokenizer.from_file(str(self._tokenizer_path), mode=self._mode)
256
284
  self._tokenizer_type = (
257
285
  MistralTokenizerType.tekken
258
286
  if isinstance(self.tokenizer.instruct_tokenizer.tokenizer, Tekkenizer)
259
287
  else MistralTokenizerType.spm
260
288
  )
261
- self.truncation_side = truncation_side
262
- self.padding_side = padding_side
263
- self.model_max_length = model_max_length
264
- self.cleanup_tokenization_spaces = clean_up_tokenization_spaces
265
- self.deprecation_warnings = {} # Use to store when we have already noticed a deprecation warning (avoid overlogging).
266
- self._all_special_tokens_ids = self._get_all_special_ids()
267
-
268
- if model_input_names is not None:
269
- if (
270
- not isinstance(model_input_names, (list, tuple))
271
- and len(model_input_names) == 0
272
- and not all(isinstance(i, str) for i in model_input_names)
273
- ):
274
- raise ValueError(
275
- "`model_input_names` should be a non-empty list or tuple of str but got an empty value."
276
- )
277
- self.model_input_names = model_input_names
278
-
279
289
  self._cache_get_vocab: dict[str, int] | None = None
280
290
 
281
- @staticmethod
282
- def clean_up_tokenization(text: str) -> str:
283
- """
284
- Clean up a list of simple English tokenization artifacts like spaces before punctuation.
285
- """
286
- return (
287
- text.replace(" .", ".")
288
- .replace(" ?", "?")
289
- .replace(" !", "!")
290
- .replace(" ,", ",")
291
- .replace(" ' ", "'")
292
- .replace(" n't", "n't")
293
- .replace(" 'm", "'m")
294
- .replace(" 's", "'s")
295
- .replace(" 've", "'ve")
296
- .replace(" 're", "'re")
291
+ self._all_special_ids = self._get_all_special_ids()
292
+ self._all_special_tokens = self.convert_ids_to_tokens(self.all_special_ids)
293
+
294
+ super().__init__(
295
+ truncation_side=truncation_side,
296
+ padding_side=padding_side,
297
+ model_max_length=model_max_length,
298
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
299
+ extra_special_tokens=None, # Not used by this backend.
300
+ model_specific_special_tokens=None, # Not used by this backend.
301
+ model_input_names=model_input_names or self.model_input_names,
302
+ **_MAP_SPECIAL_TOKENS,
303
+ **kwargs,
297
304
  )
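
A construction sketch under stated assumptions (the tokenizer file path is a placeholder; `from_pretrained`/`save_pretrained` come from the inherited `PreTrainedTokenizerBase`):

tok = MistralCommonBackend(
    tokenizer_path="path/to/tekken.json",  # placeholder local mistral-common tokenizer file
    mode="test",                           # or "finetuning"; changes validation and EOS handling
    model_max_length=32768,
    padding_side="left",
)
# Any other keyword argument outside _VALID_INIT_KWARGS is rejected:
# MistralCommonBackend(tokenizer_path="path/to/tekken.json", use_fast=True)  # -> ValueError
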
298
305
 
299
306
  @property
@@ -306,75 +313,19 @@ class MistralCommonBackend(PushToHubMixin):
306
313
  """
307
314
  return self._mode
308
315
 
309
- @property
310
- def bos_token_id(self) -> int:
311
- """
312
- Id of the beginning of sentence token in the vocabulary.
313
- """
314
- return self.tokenizer.instruct_tokenizer.tokenizer.bos_id
315
-
316
- @property
317
- def eos_token_id(self) -> int:
318
- """
319
- Id of the end of sentence token in the vocabulary.
320
- """
321
- return self.tokenizer.instruct_tokenizer.tokenizer.eos_id
322
-
323
- @property
324
- def unk_token_id(self) -> int:
325
- """
326
- Id of the unknown token in the vocabulary.
327
- """
328
- return self.tokenizer.instruct_tokenizer.tokenizer.unk_id
329
-
330
- @property
331
- def pad_token_id(self) -> int:
332
- """
333
- Id of the padding token in the vocabulary.
334
- """
335
- return self.tokenizer.instruct_tokenizer.tokenizer.pad_id
336
-
337
- @property
338
- def bos_token(self) -> str:
339
- """
340
- String associated to the beginning of sentence token in the vocabulary.
341
- """
342
- return self.convert_ids_to_tokens(self.bos_token_id)
343
-
344
- @property
345
- def eos_token(self) -> str:
346
- """
347
- String associated to the end of sentence token in the vocabulary.
348
- """
349
- return self.convert_ids_to_tokens(self.eos_token_id)
350
-
351
- @property
352
- def unk_token(self) -> str:
353
- """
354
- String associated to the unknown token in the vocabulary.
355
- """
356
- return self.convert_ids_to_tokens(self.unk_token_id)
357
-
358
- @property
359
- def pad_token(self) -> str:
360
- """
361
- String associated to the padding token in the vocabulary.
362
- """
363
- return self.convert_ids_to_tokens(self.pad_token_id)
364
-
365
316
  @property
366
317
  def all_special_ids(self) -> list[int]:
367
318
  """
368
319
  `list[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.).
369
320
  """
370
- return sorted(self._all_special_tokens_ids)
321
+ return sorted(self._all_special_ids)
371
322
 
372
323
  @property
373
324
  def all_special_tokens(self) -> list[str]:
374
325
  """
375
326
  `list[str]`: A list of all unique special tokens.
376
327
  """
377
- return self.convert_ids_to_tokens(self.all_special_ids)
328
+ return self._all_special_tokens
378
329
 
379
330
  @property
380
331
  def vocab_size(self) -> int:
@@ -435,6 +386,8 @@ class MistralCommonBackend(PushToHubMixin):
435
386
  padding_side: str | None = None,
436
387
  return_tensors: str | TensorType | None = None,
437
388
  verbose: bool = True,
389
+ return_offsets_mapping: Literal[False] = False,
390
+ split_special_tokens: Literal[False] = False,
438
391
  **kwargs,
439
392
  ) -> list[int]:
440
393
  """
@@ -446,37 +399,81 @@ class MistralCommonBackend(PushToHubMixin):
446
399
  text_pair (`None`, *optional*):
447
400
  Not supported by `MistralCommonBackend.encode`. Kept to match `PreTrainedTokenizerBase.encode` signature.
448
401
  """
402
+ if return_offsets_mapping or split_special_tokens:
403
+ raise ValueError(
404
+ "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
405
+ )
406
+
407
+ if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND, "only_first", "only_second"]:
408
+ raise ValueError(
409
+ "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonBackend`."
410
+ )
411
+
449
412
  if kwargs:
450
413
  raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.encode`.")
414
+
451
415
  if text_pair:
452
416
  raise ValueError("`MistralCommonBackend.encode` does not support `text_pair`.")
453
417
 
454
- padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
418
+ return super().encode(
419
+ text=text,
420
+ text_pair=text_pair,
421
+ add_special_tokens=add_special_tokens,
455
422
  padding=padding,
456
423
  truncation=truncation,
457
424
  max_length=max_length,
458
- pad_to_multiple_of=pad_to_multiple_of,
459
- verbose=verbose,
460
- )
461
-
462
- encoded_inputs = self._encode_plus(
463
- text,
464
- add_special_tokens=add_special_tokens,
465
- padding_strategy=padding_strategy,
466
- truncation_strategy=truncation_strategy,
467
- max_length=max_length,
468
425
  stride=stride,
426
+ return_tensors=return_tensors,
469
427
  pad_to_multiple_of=pad_to_multiple_of,
470
428
  padding_side=padding_side,
471
- return_tensors=return_tensors,
472
- return_attention_mask=False,
473
- return_overflowing_tokens=False,
474
- return_special_tokens_mask=False,
475
- return_length=False,
476
429
  verbose=verbose,
477
430
  )
478
431
 
479
- return encoded_inputs["input_ids"]
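
An `encode` sketch (assuming `tok` from the construction example above; the exact ids are model dependent):

ids = tok.encode("Hello world")  # with the default add_special_tokens=True, the model's BOS
                                 # (and, in finetuning mode, EOS) is accounted for
short_ids = tok.encode("Hello world", truncation=True, max_length=4)

# Unsupported options raise instead of being silently ignored:
# tok.encode("Hello world", return_offsets_mapping=True)  # -> ValueError
# tok.encode("Hello", text_pair="world")                  # -> ValueError
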
432
+ def _decode(
433
+ self,
434
+ token_ids: int | list[int],
435
+ skip_special_tokens: bool = False,
436
+ clean_up_tokenization_spaces: bool | None = None,
437
+ **kwargs,
438
+ ) -> str:
439
+ if kwargs:
440
+ raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.decode`.")
441
+
442
+ token_ids = to_py_obj(token_ids)
443
+
444
+ if isinstance(token_ids, int):
445
+ token_ids = [token_ids]
446
+
447
+ special_token_policy = SpecialTokenPolicy.IGNORE if skip_special_tokens else SpecialTokenPolicy.KEEP
448
+
449
+ text = self.tokenizer.decode(token_ids, special_token_policy=special_token_policy)
450
+
451
+ # Apply tokenizer-specific cleanup if available and requested
452
+ clean_up_tokenization_spaces = (
453
+ clean_up_tokenization_spaces
454
+ if clean_up_tokenization_spaces is not None
455
+ else self.clean_up_tokenization_spaces
456
+ )
457
+ if clean_up_tokenization_spaces:
458
+ # Call custom cleanup method if it exists (e.g., for CLVP's [SPACE] token replacement)
459
+ if hasattr(self, "clean_up_tokenization") and callable(self.clean_up_tokenization):
460
+ text = self.clean_up_tokenization(text)
461
+ else:
462
+ # Otherwise apply standard cleanup
463
+ text = (
464
+ text.replace(" .", ".")
465
+ .replace(" ?", "?")
466
+ .replace(" !", "!")
467
+ .replace(" ,", ",")
468
+ .replace(" ' ", "'")
469
+ .replace(" n't", "n't")
470
+ .replace(" 'm", "'m")
471
+ .replace(" 's", "'s")
472
+ .replace(" 've", "'ve")
473
+ .replace(" 're", "'re")
474
+ )
475
+
476
+ return _maybe_remove_lang(text=text, skip_special_tokens=skip_special_tokens)
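
A decoding sketch (ids reuse the `encode` example above; the `lang:xx` stripping only matters for Voxtral-style transcriptions):

text = tok.decode(ids, skip_special_tokens=True)
# skip_special_tokens=True decodes with SpecialTokenPolicy.IGNORE and also removes a leading
# "lang:xx" prefix; clean_up_tokenization_spaces falls back to the value given at init.
texts = tok.batch_decode([ids, short_ids], skip_special_tokens=True)
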
480
477
 
481
478
  def decode(
482
479
  self,
@@ -484,7 +481,7 @@ class MistralCommonBackend(PushToHubMixin):
484
481
  skip_special_tokens: bool = False,
485
482
  clean_up_tokenization_spaces: bool | None = None,
486
483
  **kwargs,
487
- ) -> Union[str, list[str]]:
484
+ ) -> str | list[str]:
488
485
  """
489
486
  Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
490
487
  tokens and clean up tokenization spaces.
@@ -509,16 +506,7 @@ class MistralCommonBackend(PushToHubMixin):
509
506
  if kwargs:
510
507
  raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.decode`.")
511
508
 
512
- token_ids = to_py_obj(token_ids)
513
-
514
- if isinstance(token_ids, (list, tuple)) and len(token_ids) > 0 and isinstance(token_ids[0], (list, tuple)):
515
- return self._batch_decode(
516
- sequences=token_ids,
517
- skip_special_tokens=skip_special_tokens,
518
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
519
- )
520
-
521
- return self._decode(
509
+ return super().decode(
522
510
  token_ids=token_ids,
523
511
  skip_special_tokens=skip_special_tokens,
524
512
  clean_up_tokenization_spaces=clean_up_tokenization_spaces,
@@ -555,63 +543,12 @@ class MistralCommonBackend(PushToHubMixin):
555
543
  if kwargs:
556
544
  raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.batch_decode`.")
557
545
 
558
- return self._batch_decode(
546
+ return super().batch_decode(
559
547
  sequences=sequences,
560
548
  skip_special_tokens=skip_special_tokens,
561
549
  clean_up_tokenization_spaces=clean_up_tokenization_spaces,
562
550
  )
563
551
 
564
- def _decode(
565
- self,
566
- token_ids: Union[int, list[int], list[list[int]], np.ndarray, "torch.Tensor"],
567
- skip_special_tokens: bool = False,
568
- clean_up_tokenization_spaces: bool | None = None,
569
- ) -> str:
570
- clean_up_tokenization_spaces = clean_up_tokenization_spaces or self.cleanup_tokenization_spaces
571
-
572
- # Convert inputs to python lists
573
- if isinstance(token_ids, int):
574
- token_ids = [token_ids]
575
-
576
- token_ids = to_py_obj(token_ids)
577
-
578
- special_token_policy = SpecialTokenPolicy.IGNORE if skip_special_tokens else SpecialTokenPolicy.KEEP
579
-
580
- decoded_string = self.tokenizer.decode(token_ids, special_token_policy=special_token_policy)
581
- if clean_up_tokenization_spaces:
582
- decoded_string = self.clean_up_tokenization(decoded_string)
583
-
584
- # in the specific case of Voxtral, the added f"lang:xx" (always a two char language code since it follows ISO 639-1 alpha-2 format)
585
- # is not considered as a special token by mistral-common and is encoded/ decoded as normal text.
586
- # Nevertheless we should remove it to ease users life.
587
- if skip_special_tokens:
588
- decoded_string = re.sub(r"^lang:[a-z]{2}", "", decoded_string)
589
-
590
- return decoded_string
591
-
592
- def _batch_decode(
593
- self,
594
- sequences: Union[list[int], list[list[int]], np.ndarray, "torch.Tensor"],
595
- skip_special_tokens: bool = False,
596
- clean_up_tokenization_spaces: bool | None = None,
597
- ) -> list[str]:
598
- return [
599
- self._decode(
600
- seq,
601
- skip_special_tokens=skip_special_tokens,
602
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
603
- )
604
- for seq in sequences
605
- ]
606
-
607
- def _is_control_token(self, token_id: int) -> bool:
608
- if self._tokenizer_type == MistralTokenizerType.spm:
609
- return token_id in self.tokenizer.instruct_tokenizer.tokenizer._control_tokens
610
- elif self._tokenizer_type == MistralTokenizerType.tekken:
611
- return token_id < self.tokenizer.instruct_tokenizer.tokenizer.num_special_tokens
612
- else:
613
- raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
614
-
615
552
  @overload
616
553
  def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ...
617
554
  @overload
@@ -632,22 +569,22 @@ class MistralCommonBackend(PushToHubMixin):
632
569
  """
633
570
 
634
571
  if isinstance(ids, int):
635
- one_token = True
572
+ return_int = True
636
573
  ids = [ids]
637
574
  else:
638
- one_token = False
575
+ return_int = False
639
576
 
640
577
  tokens: list[str] = []
641
578
  for token_id in ids:
642
- if self._is_control_token(token_id) and skip_special_tokens:
579
+ if self.tokenizer.instruct_tokenizer.tokenizer.is_special(token_id) and skip_special_tokens:
643
580
  continue
644
581
  tokens.append(self.tokenizer.instruct_tokenizer.tokenizer.id_to_piece(token_id))
645
582
 
646
- if one_token:
647
- if tokens == []:
648
- raise ValueError(f"Invalid token id {ids}.")
649
-
583
+ if return_int and tokens == []:
584
+ raise ValueError(f"Invalid token id {ids[0]}.")
585
+ elif return_int:
650
586
  return tokens[0]
587
+
651
588
  return tokens
652
589
 
653
590
  def _tekken_piece_to_id(self, piece: str, warn: bool) -> int:
@@ -708,7 +645,13 @@ class MistralCommonBackend(PushToHubMixin):
708
645
  tokens_ids = self.tokenizer.instruct_tokenizer.tokenizer.encode(text, bos=add_special_tokens, eos=add_eos)
709
646
  return tokens_ids
710
647
 
711
- def tokenize(self, text: TextInput, **kwargs) -> list[str]:
648
+ def tokenize(
649
+ self,
650
+ text: TextInput,
651
+ return_offsets_mapping: Literal[False] = False,
652
+ split_special_tokens: Literal[False] = False,
653
+ **kwargs,
654
+ ) -> list[str]:
712
655
  """
713
656
  Converts a string into a sequence of tokens, using the tokenizer.
714
657
 
@@ -717,6 +660,8 @@ class MistralCommonBackend(PushToHubMixin):
717
660
  Args:
718
661
  text (`str`):
719
662
  The sequence to be encoded.
663
+ return_offsets_mapping (`Literal[False]`, *optional*): Must be `False`; kept only to match Transformers' signature.
664
+ split_special_tokens (`Literal[False]`, *optional*): Must be `False`; kept only to match Transformers' signature.
720
665
  **kwargs (additional keyword arguments):
721
666
  Not supported by `MistralCommonBackend.tokenize`.
722
667
  Will raise an error if used.
@@ -724,40 +669,164 @@ class MistralCommonBackend(PushToHubMixin):
724
669
  Returns:
725
670
  `list[str]`: The list of tokens.
726
671
  """
672
+ if return_offsets_mapping or split_special_tokens:
673
+ raise ValueError(
674
+ "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
675
+ )
676
+
727
677
  if kwargs:
728
678
  raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.tokenize`.")
729
679
 
730
680
  return self.convert_ids_to_tokens(self._text_to_ids(text, add_special_tokens=False), skip_special_tokens=False)
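
For example (the token strings shown are illustrative; they differ between the SPM and Tekken tokenizers):

pieces = tok.tokenize("Hello world")   # e.g. ["Hello", " world"]; no BOS/EOS is added here
ids = tok.convert_tokens_to_ids(pieces)
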
731
681
 
732
- def _encode_plus(
682
+ def _get_all_special_ids(self) -> set[int]:
683
+ if self._tokenizer_type == MistralTokenizerType.tekken:
684
+ return self.tokenizer.instruct_tokenizer.tokenizer._special_token_ids
685
+ elif self._tokenizer_type == MistralTokenizerType.spm:
686
+ return {
687
+ token_id
688
+ for token_id in range(self.tokenizer.instruct_tokenizer.tokenizer.n_words)
689
+ if self.tokenizer.instruct_tokenizer.tokenizer.is_special(token_id)
690
+ }
691
+ else:
692
+ raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
693
+
694
+ def get_special_tokens_mask(
695
+ self, token_ids_0: list[int], token_ids_1: None = None, already_has_special_tokens: bool = False
696
+ ) -> list[int]:
697
+ """
698
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
699
+ special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
700
+
701
+ Args:
702
+ token_ids_0 (`list[int]`): List of ids of the sequence.
703
+ token_ids_1 (`None`, *optional*): None, kept to match Transformers' implementation.
704
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
705
+ Whether or not the token list is already formatted with special tokens for the model.
706
+
707
+ Returns:
708
+ A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
709
+ """
710
+ if token_ids_1 is not None:
711
+ raise ValueError(
712
+ "`token_ids_1` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
713
+ )
714
+
715
+ if already_has_special_tokens:
716
+ return [1 if int(token_id) in self._all_special_ids else 0 for token_id in token_ids_0]
717
+
718
+ if self.mode == ValidationMode.test:
719
+ # [BOS] seq0
720
+ return [1] + ([0] * len(token_ids_0))
721
+ else:
722
+ # [BOS] seq0 [EOS]
723
+ return [1] + ([0] * len(token_ids_0)) + [1]
724
+
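
A sketch of the masks produced by the branches above (plain lists; the result depends only on the validation mode):

mask = tok.get_special_tokens_mask([11, 22, 33])
# test mode:       [1, 0, 0, 0]      ([BOS] t0 t1 t2)
# finetuning mode: [1, 0, 0, 0, 1]   ([BOS] t0 t1 t2 [EOS])

mask = tok.get_special_tokens_mask(
    [tok.bos_token_id, 22, 33], already_has_special_tokens=True
)
# -> [1, 0, 0]; each id is simply checked against the tokenizer's special-id set
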
725
+ def _encode_plus( # type: ignore[override]
733
726
  self,
734
- text: TextInput | EncodedInput,
727
+ text: TextInput | PreTokenizedInput | EncodedInput,
728
+ text_pair: None = None,
735
729
  add_special_tokens: bool = True,
736
730
  padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
737
731
  truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
738
732
  max_length: int | None = None,
739
733
  stride: int = 0,
734
+ is_split_into_words: bool = False,
740
735
  pad_to_multiple_of: int | None = None,
741
736
  padding_side: str | None = None,
742
737
  return_tensors: str | TensorType | None = None,
738
+ return_token_type_ids: bool | None = None,
743
739
  return_attention_mask: bool | None = None,
744
740
  return_overflowing_tokens: bool = False,
745
741
  return_special_tokens_mask: bool = False,
746
742
  return_length: bool = False,
747
743
  verbose: bool = True,
744
+ return_offsets_mapping: Literal[False] = False,
745
+ split_special_tokens: Literal[False] = False,
746
+ **kwargs,
748
747
  ) -> BatchEncoding:
748
+ # Reject unsupported arguments, then detect batched inputs (lists of sequences)
749
+ if text_pair is not None:
750
+ raise ValueError("`MistralCommonBackend` does not support `text_pair != None` for `_encode_plus`.")
751
+
752
+ if return_offsets_mapping or split_special_tokens:
753
+ raise ValueError(
754
+ "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
755
+ )
756
+
757
+ if kwargs:
758
+ raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend._encode_plus`.")
759
+
760
+ is_batched = isinstance(text, (list, tuple)) and (
761
+ (not text and not is_split_into_words)
762
+ or (text and is_split_into_words and isinstance(text[0], (list, tuple)))
763
+ or (text and not is_split_into_words and isinstance(text[0], (str, list, tuple)))
764
+ )
765
+
766
+ if is_batched:
767
+ batch_outputs = {}
768
+ one_overflowed = False
769
+ for current_text in text:
770
+ current_output = self._encode_plus(
771
+ text=current_text,
772
+ text_pair=None,
773
+ add_special_tokens=add_special_tokens,
774
+ padding_strategy=PaddingStrategy.DO_NOT_PAD, # we pad in batch afterward
775
+ truncation_strategy=truncation_strategy,
776
+ max_length=max_length,
777
+ stride=stride,
778
+ is_split_into_words=is_split_into_words,
779
+ pad_to_multiple_of=None, # we pad in batch afterward
780
+ padding_side=None, # we pad in batch afterward
781
+ return_tensors=None, # We convert the whole batch to tensors at the end
782
+ return_token_type_ids=return_token_type_ids,
783
+ return_attention_mask=False, # we pad in batch afterward
784
+ return_overflowing_tokens=return_overflowing_tokens,
785
+ return_special_tokens_mask=return_special_tokens_mask,
786
+ return_length=return_length,
787
+ verbose=verbose,
788
+ )
789
+ for key, value in current_output.items():
790
+ batch_outputs.setdefault(key, []).append(value)
791
+
792
+ # Make sure the overflow lists get an entry for every sample so the batch columns stay aligned.
793
+ if return_overflowing_tokens and not return_tensors:
794
+ if "overflowing_tokens" not in current_output:
795
+ batch_outputs.setdefault("overflowing_tokens", []).append([0])
796
+ batch_outputs.setdefault("num_truncated_tokens", []).append([0])
797
+ else:
798
+ one_overflowed = True
799
+
800
+ # Remove overflow-related keys before tensor conversion if return_tensors is set
801
+ # Slow tokenizers don't support returning these as tensors
802
+ if return_overflowing_tokens and (return_tensors or not one_overflowed):
803
+ batch_outputs.pop("overflowing_tokens", None)
804
+ batch_outputs.pop("num_truncated_tokens", None)
805
+
806
+ batch_outputs = self.pad(
807
+ batch_outputs,
808
+ padding=padding_strategy.value,
809
+ max_length=max_length,
810
+ pad_to_multiple_of=pad_to_multiple_of,
811
+ padding_side=padding_side,
812
+ return_attention_mask=return_attention_mask,
813
+ )
814
+
815
+ return BatchEncoding(batch_outputs, tensor_type=return_tensors)
816
+
749
817
  def get_input_ids(text):
750
818
  if isinstance(text, str):
751
- return self._text_to_ids(text, add_special_tokens)
819
+ return self._text_to_ids(text, False)
752
820
  elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
753
821
  return text
754
822
  else:
755
823
  raise ValueError(f"Input {text} is not valid. Should be a string, or a list/tuple of integers.")
756
824
 
757
- ids = get_input_ids(text)
825
+ first_ids = get_input_ids(text)
758
826
 
759
827
  return self.prepare_for_model(
760
- ids,
828
+ first_ids,
829
+ pair_ids=None,
761
830
  add_special_tokens=add_special_tokens,
762
831
  padding=padding_strategy.value,
763
832
  truncation=truncation_strategy.value,
@@ -768,202 +837,62 @@ class MistralCommonBackend(PushToHubMixin):
768
837
  return_tensors=return_tensors,
769
838
  prepend_batch_axis=True,
770
839
  return_attention_mask=return_attention_mask,
840
+ return_token_type_ids=return_token_type_ids,
771
841
  return_overflowing_tokens=return_overflowing_tokens,
772
842
  return_special_tokens_mask=return_special_tokens_mask,
773
843
  return_length=return_length,
774
844
  verbose=verbose,
775
845
  )
776
846
 
777
- def _batch_encode_plus(
847
+ @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
848
+ def prepare_for_model(
778
849
  self,
779
- batch_text: list[TextInput] | list[EncodedInput],
850
+ ids: list[int],
851
+ pair_ids: None = None,
780
852
  add_special_tokens: bool = True,
781
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
782
- truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
853
+ padding: bool | str | PaddingStrategy = False,
854
+ truncation: bool | str | TruncationStrategy | None = None,
783
855
  max_length: int | None = None,
784
856
  stride: int = 0,
785
857
  pad_to_multiple_of: int | None = None,
786
858
  padding_side: str | None = None,
787
859
  return_tensors: str | TensorType | None = None,
860
+ return_token_type_ids: bool | None = None,
788
861
  return_attention_mask: bool | None = None,
789
862
  return_overflowing_tokens: bool = False,
790
863
  return_special_tokens_mask: bool = False,
791
864
  return_length: bool = False,
792
865
  verbose: bool = True,
866
+ prepend_batch_axis: bool = False,
867
+ return_offsets_mapping: Literal[False] = False,
868
+ split_special_tokens: Literal[False] = False,
869
+ **kwargs,
793
870
  ) -> BatchEncoding:
794
- def get_input_ids(text):
795
- if isinstance(text, str):
796
- return self._text_to_ids(text, add_special_tokens)
797
- elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
798
- return text
799
- else:
800
- raise ValueError("Input is not valid. Should be a string or a list/tuple of integers.")
801
-
802
- input_ids = []
803
- for ids in batch_text:
804
- input_ids.append(get_input_ids(ids))
805
-
806
- batch_outputs = self._batch_prepare_for_model(
807
- input_ids,
808
- add_special_tokens=add_special_tokens,
809
- padding_strategy=padding_strategy,
810
- truncation_strategy=truncation_strategy,
811
- max_length=max_length,
812
- stride=stride,
813
- pad_to_multiple_of=pad_to_multiple_of,
814
- padding_side=padding_side,
815
- return_attention_mask=return_attention_mask,
816
- return_overflowing_tokens=return_overflowing_tokens,
817
- return_special_tokens_mask=return_special_tokens_mask,
818
- return_length=return_length,
819
- return_tensors=return_tensors,
820
- verbose=verbose,
821
- )
822
-
823
- return BatchEncoding(batch_outputs)
824
-
825
- def _get_all_special_ids(self) -> set[int]:
826
- if self._tokenizer_type == MistralTokenizerType.tekken:
827
- return {t["rank"] for t in self.tokenizer.instruct_tokenizer.tokenizer._all_special_tokens}
828
- elif self._tokenizer_type == MistralTokenizerType.spm:
829
- return self.tokenizer.instruct_tokenizer.tokenizer._control_tokens
830
- else:
831
- raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
832
-
833
- def get_special_tokens_mask(
834
- self, token_ids_0: list, token_ids_1: None = None, already_has_special_tokens: bool = False
835
- ) -> list[int]:
836
871
  """
837
- Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
838
- special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
872
+ Prepares a sequence of input ids so that it can be used by the model. It
873
+ adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
874
+ manages a moving window (with a user-defined stride) for overflowing tokens.
839
875
 
840
876
  Args:
841
- token_ids_0 (`list[int]`):
842
- List of ids of the sequence.
843
- token_ids_1 (`list[int]`, *optional*):
877
+ ids (`list[int]`):
878
+ Tokenized input ids of the first sequence.
879
+ pair_ids (`None`, *optional*):
844
880
  Not supported by `MistralCommonBackend`. Kept to match the interface of `PreTrainedTokenizerBase`.
845
- already_has_special_tokens (`bool`, *optional*, defaults to `False`):
846
- Whether or not the token list is already formatted with special tokens for the model.
847
-
848
- Returns:
849
- A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
850
881
  """
851
- if token_ids_1 is not None:
882
+ if return_offsets_mapping or split_special_tokens:
852
883
  raise ValueError(
853
- "`token_ids_1` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
884
+ "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
854
885
  )
855
- if already_has_special_tokens:
886
+
887
+ if pair_ids is not None:
856
888
  raise ValueError(
857
- "`already_has_special_tokens` is not supported by `MistralCommonBackend` and should be `False`."
889
+ "`pair_ids` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
858
890
  )
859
891
 
860
- special_tokens_mask = [1 if token in self._all_special_tokens_ids else 0 for token in token_ids_0]
861
- return special_tokens_mask
862
-
863
- def _batch_prepare_for_model(
864
- self,
865
- batch_ids: list[PreTokenizedInput | list[int]],
866
- add_special_tokens: bool = True,
867
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
868
- truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
869
- max_length: int | None = None,
870
- stride: int = 0,
871
- pad_to_multiple_of: int | None = None,
872
- padding_side: str | None = None,
873
- return_tensors: str | None = None,
874
- return_attention_mask: bool | None = None,
875
- return_overflowing_tokens: bool = False,
876
- return_special_tokens_mask: bool = False,
877
- return_length: bool = False,
878
- verbose: bool = True,
879
- ) -> BatchEncoding:
880
- """
881
- Prepares a sequence of input id so that it can be used by the model. It
882
- adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
883
- manages a moving window (with user defined stride) for overflowing tokens.
884
-
885
- Args:
886
- batch_ids: list of tokenized input ids
887
- """
888
-
889
- batch_outputs = {}
890
- for ids in batch_ids:
891
- outputs = self.prepare_for_model(
892
- ids,
893
- add_special_tokens=add_special_tokens,
894
- padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward
895
- truncation=truncation_strategy.value,
896
- max_length=max_length,
897
- stride=stride,
898
- pad_to_multiple_of=None, # we pad in batch afterward
899
- padding_side=None, # we pad in batch afterward
900
- return_attention_mask=False, # we pad in batch afterward
901
- return_overflowing_tokens=return_overflowing_tokens,
902
- return_special_tokens_mask=return_special_tokens_mask,
903
- return_length=return_length,
904
- return_tensors=None, # We convert the whole batch to tensors at the end
905
- prepend_batch_axis=False,
906
- verbose=verbose,
907
- )
908
-
909
- for key, value in outputs.items():
910
- if key not in batch_outputs:
911
- batch_outputs[key] = []
912
- batch_outputs[key].append(value)
913
-
914
- batch_outputs = self.pad(
915
- batch_outputs,
916
- padding=padding_strategy.value,
917
- max_length=max_length,
918
- pad_to_multiple_of=pad_to_multiple_of,
919
- padding_side=padding_side,
920
- return_attention_mask=return_attention_mask,
921
- )
922
-
923
- batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
924
-
925
- return batch_outputs
926
-
927
- @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
928
- def prepare_for_model(
929
- self,
930
- ids: list[int],
931
- pair_ids: None = None,
932
- add_special_tokens: bool = True,
933
- padding: bool | str | PaddingStrategy = False,
934
- truncation: bool | str | TruncationStrategy | None = None,
935
- max_length: int | None = None,
936
- stride: int = 0,
937
- pad_to_multiple_of: int | None = None,
938
- padding_side: str | None = None,
939
- return_tensors: str | TensorType | None = None,
940
- return_attention_mask: bool | None = None,
941
- return_overflowing_tokens: bool = False,
942
- return_special_tokens_mask: bool = False,
943
- return_length: bool = False,
944
- verbose: bool = True,
945
- prepend_batch_axis: bool = False,
946
- **kwargs,
947
- ) -> BatchEncoding:
948
- """
949
- Prepares a sequence of input id so that it can be used by the model. It
950
- adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
951
- manages a moving window (with user defined stride) for overflowing tokens.
952
-
953
- Args:
954
- ids (`list[int]`):
955
- Tokenized input ids of the first sequence.
956
- pair_ids (`None`, *optional*):
957
- Not supported by `MistralCommonBackend`. Kept to match the interface of `PreTrainedTokenizerBase`.
958
- """
959
- if pair_ids is not None:
960
- raise ValueError(
961
- "`pair_ids` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
962
- )
963
- if kwargs:
964
- raise ValueError(
965
- f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.prepare_for_model`."
966
- )
892
+ if kwargs:
893
+ raise ValueError(
894
+ f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.prepare_for_model`."
895
+ )
967
896
 
968
897
  padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
969
898
  padding=padding,
@@ -971,39 +900,65 @@ class MistralCommonBackend(PushToHubMixin):
971
900
  max_length=max_length,
972
901
  pad_to_multiple_of=pad_to_multiple_of,
973
902
  verbose=verbose,
903
+ **kwargs,
974
904
  )
975
905
 
976
- len_ids = len(ids)
906
+ # Validation
907
+ if (
908
+ return_overflowing_tokens
909
+ and truncation_strategy == TruncationStrategy.LONGEST_FIRST
910
+ and pair_ids is not None
911
+ ):
912
+ raise ValueError(
913
+ "Not possible to return overflowing tokens for pair of sequences with the "
914
+ "`longest_first`. Please select another truncation strategy than `longest_first`, "
915
+ "for instance `only_second` or `only_first`."
916
+ )
977
917
 
978
- # Load from model defaults
918
+ # Defaults
919
+ if return_token_type_ids is None:
920
+ return_token_type_ids = "token_type_ids" in self.model_input_names
979
921
  if return_attention_mask is None:
980
922
  return_attention_mask = "attention_mask" in self.model_input_names
981
923
 
982
- encoded_inputs = {}
924
+ # Truncation
925
+ num_special = self.num_special_tokens_to_add(pair=False) if add_special_tokens else 0
926
+ total_len = len(ids) + len(pair_ids or []) + num_special
983
927
 
984
- # Truncation: Handle max sequence length
985
928
  overflowing_tokens = []
986
- if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and len_ids > max_length:
929
+ if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
987
930
  ids, _, overflowing_tokens = self.truncate_sequences(
988
931
  ids,
989
- num_tokens_to_remove=len_ids - max_length,
932
+ pair_ids=None,
933
+ num_tokens_to_remove=total_len - max_length,
990
934
  truncation_strategy=truncation_strategy,
991
935
  stride=stride,
992
936
  )
993
937
 
994
- if return_overflowing_tokens:
995
- encoded_inputs["overflowing_tokens"] = overflowing_tokens
996
- encoded_inputs["num_truncated_tokens"] = len_ids - max_length
938
+ # Add special tokens
939
+ if add_special_tokens:
940
+ sequence = self.build_inputs_with_special_tokens(ids, None)
941
+ token_type_ids = self.create_token_type_ids_from_sequences(ids, None)
942
+ else:
943
+ sequence = ids
944
+ token_type_ids = [0] * len(sequence)
997
945
 
998
- # Build output dictionary
999
- encoded_inputs[self.model_input_names[0]] = ids
946
+ # Build output
947
+ encoded_inputs = {"input_ids": sequence}
948
+ if return_token_type_ids:
949
+ encoded_inputs["token_type_ids"] = token_type_ids
1000
950
  if return_special_tokens_mask:
1001
- if add_special_tokens:
1002
- encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, None)
1003
- else:
1004
- encoded_inputs["special_tokens_mask"] = [0] * len(ids)
951
+ encoded_inputs["special_tokens_mask"] = (
952
+ self.get_special_tokens_mask(ids, None) if add_special_tokens else [0] * len(sequence)
953
+ )
954
+ if return_overflowing_tokens and not return_tensors and overflowing_tokens:
955
+ encoded_inputs["overflowing_tokens"] = overflowing_tokens
956
+ encoded_inputs["num_truncated_tokens"] = total_len - max_length if max_length else 0
957
+
958
+ # Check sequence length and warn if needed
959
+ self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
1005
960
 
1006
- # Padding
961
+ # Pad
1007
962
  if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
1008
963
  encoded_inputs = self.pad(
1009
964
  encoded_inputs,
@@ -1017,362 +972,9 @@ class MistralCommonBackend(PushToHubMixin):
1017
972
  if return_length:
1018
973
  encoded_inputs["length"] = len(encoded_inputs["input_ids"])
1019
974
 
1020
- batch_outputs = BatchEncoding(
1021
- encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
1022
- )
1023
-
1024
- return batch_outputs
1025
-
1026
- def _get_padding_truncation_strategies(
1027
- self,
1028
- padding: str | PaddingStrategy | bool = False,
1029
- truncation: str | TruncationStrategy | bool | None = None,
1030
- max_length: int | None = None,
1031
- pad_to_multiple_of: int | None = None,
1032
- verbose: bool = True,
1033
- **kwargs,
1034
- ):
1035
- """
1036
- Find the correct padding/truncation strategy.
1037
- """
1038
-
1039
- # Backward compatibility for previous behavior, maybe we should deprecate it:
1040
- # If you only set max_length, it activates truncation for max_length
1041
- if max_length is not None and padding is False and truncation is None:
1042
- if verbose:
1043
- if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
1044
- logger.warning(
1045
- "Truncation was not explicitly activated but `max_length` is provided a specific value, please"
1046
- " use `truncation=True` to explicitly truncate examples to max length. Defaulting to"
1047
- " 'longest_first' truncation strategy."
1048
- )
1049
- self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
1050
- truncation = "longest_first"
1051
-
1052
- # Get padding strategy
1053
- if padding is not False:
1054
- if padding is True:
1055
- if verbose:
1056
- if max_length is not None and (
1057
- truncation is None or truncation is False or truncation == "do_not_truncate"
1058
- ):
1059
- warnings.warn(
1060
- "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
1061
- "To pad to max length, use `padding='max_length'`."
1062
- )
1063
- padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch
1064
- elif not isinstance(padding, PaddingStrategy):
1065
- padding_strategy = PaddingStrategy(padding)
1066
- elif isinstance(padding, PaddingStrategy):
1067
- padding_strategy = padding
1068
- else:
1069
- padding_strategy = PaddingStrategy.DO_NOT_PAD
1070
-
1071
- # Get truncation strategy
1072
- if truncation is not False and truncation is not None:
1073
- if truncation is True:
1074
- truncation_strategy = (
1075
- TruncationStrategy.LONGEST_FIRST
1076
- ) # Default to truncate the longest sequences in pairs of inputs
1077
- elif not isinstance(truncation, TruncationStrategy):
1078
- truncation_strategy = TruncationStrategy(truncation)
1079
- elif isinstance(truncation, TruncationStrategy):
1080
- truncation_strategy = truncation
1081
- if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND]:
1082
- raise ValueError(
1083
- "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonBackend`."
1084
- )
1085
- else:
1086
- truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
1087
-
1088
- # Set max length if needed
1089
- if max_length is None:
1090
- if padding_strategy == PaddingStrategy.MAX_LENGTH:
1091
- if self.model_max_length > LARGE_INTEGER:
1092
- if verbose:
1093
- if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
1094
- logger.warning(
1095
- "Asking to pad to max_length but no maximum length is provided and the model has no"
1096
- " predefined maximum length. Default to no padding."
1097
- )
1098
- self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
1099
- padding_strategy = PaddingStrategy.DO_NOT_PAD
1100
- else:
1101
- max_length = self.model_max_length
1102
-
1103
- if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
1104
- if self.model_max_length > LARGE_INTEGER:
1105
- if verbose:
1106
- if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
1107
- logger.warning(
1108
- "Asking to truncate to max_length but no maximum length is provided and the model has"
1109
- " no predefined maximum length. Default to no truncation."
1110
- )
1111
- self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
1112
- truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
1113
- else:
1114
- max_length = self.model_max_length
1115
-
1116
- # Test if we have a padding token
1117
- if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.pad_token_id is None or self.pad_token_id < 0):
1118
- raise ValueError(
1119
- "Asking to pad but the tokenizer does not have a padding token. "
1120
- "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
1121
- "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
1122
- )
1123
-
1124
- # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
1125
- if (
1126
- truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
1127
- and padding_strategy != PaddingStrategy.DO_NOT_PAD
1128
- and pad_to_multiple_of is not None
1129
- and max_length is not None
1130
- and (max_length % pad_to_multiple_of != 0)
1131
- ):
1132
- raise ValueError(
1133
- "Truncation and padding are both activated but "
1134
- f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
1135
- )
1136
-
1137
- return padding_strategy, truncation_strategy, max_length, kwargs
1138
-
1139
- def _pad(
1140
- self,
1141
- encoded_inputs: dict[str, EncodedInput] | BatchEncoding,
1142
- max_length: int | None = None,
1143
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
1144
- pad_to_multiple_of: int | None = None,
1145
- padding_side: str | None = None,
1146
- return_attention_mask: bool | None = None,
1147
- ) -> dict:
1148
- """
1149
- Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
1150
-
1151
- Args:
1152
- encoded_inputs:
1153
- Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`).
1154
- max_length: maximum length of the returned list and optionally padding length (see below).
1155
- Will truncate by taking into account the special tokens.
1156
- padding_strategy: PaddingStrategy to use for padding.
1157
-
1158
- - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
1159
- - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
1160
- - PaddingStrategy.DO_NOT_PAD: Do not pad
1161
- The tokenizer padding sides are defined in `padding_side` argument:
1162
-
1163
- - 'left': pads on the left of the sequences
1164
- - 'right': pads on the right of the sequences
1165
- pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
1166
- This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
1167
- `>= 7.5` (Volta).
1168
- padding_side:
1169
- The side on which the model should have padding applied. Should be selected between ['right', 'left'].
1170
- Default value is picked from the class attribute of the same name.
1171
- return_attention_mask:
1172
- (optional) Set to False to avoid returning attention mask (default: set to model specifics)
1173
- """
1174
- # Load from model defaults
1175
- if return_attention_mask is None:
1176
- return_attention_mask = "attention_mask" in self.model_input_names
1177
-
1178
- required_input = encoded_inputs[self.model_input_names[0]]
1179
-
1180
- if padding_strategy == PaddingStrategy.LONGEST:
1181
- max_length = len(required_input)
1182
-
1183
- if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
1184
- max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
1185
-
1186
- needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
1187
-
1188
- # Initialize attention mask if not present.
1189
- if return_attention_mask and "attention_mask" not in encoded_inputs:
1190
- encoded_inputs["attention_mask"] = [1] * len(required_input)
1191
-
1192
- if needs_to_be_padded:
1193
- difference = max_length - len(required_input)
1194
- padding_side = padding_side if padding_side is not None else self.padding_side
1195
-
1196
- if padding_side == "right":
1197
- if return_attention_mask:
1198
- encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
1199
- if "special_tokens_mask" in encoded_inputs:
1200
- encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
1201
- encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
1202
- elif padding_side == "left":
1203
- if return_attention_mask:
1204
- encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
1205
- if "special_tokens_mask" in encoded_inputs:
1206
- encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
1207
- encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
1208
- else:
1209
- raise ValueError(f"Invalid padding strategy:{padding_side}")
1210
-
1211
- return encoded_inputs
975
+ return BatchEncoding(encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis)
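
A truncation/overflow sketch for `prepare_for_model` (placeholder ids; overflow keys are only returned for plain Python lists, i.e. when `return_tensors` is unset):

enc = tok.prepare_for_model(
    [10, 11, 12, 13, 14, 15],
    add_special_tokens=False,
    truncation=True,
    max_length=4,
    return_overflowing_tokens=True,
)
print(enc["input_ids"])             # [10, 11, 12, 13] -- truncated on the right by default
print(enc["overflowing_tokens"])    # [14, 15] (a stride > 0 would widen this window)
print(enc["num_truncated_tokens"])  # 2
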
1212
976
 
1213
- def pad(
1214
- self,
1215
- encoded_inputs: BatchEncoding
1216
- | list[BatchEncoding]
1217
- | dict[str, EncodedInput]
1218
- | dict[str, list[EncodedInput]]
1219
- | list[dict[str, EncodedInput]],
1220
- padding: bool | str | PaddingStrategy = True,
1221
- max_length: int | None = None,
1222
- pad_to_multiple_of: int | None = None,
1223
- padding_side: str | None = None,
1224
- return_attention_mask: bool | None = None,
1225
- return_tensors: str | TensorType | None = None,
1226
- verbose: bool = True,
1227
- ) -> BatchEncoding:
1228
- """
1229
- Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
1230
- in the batch.
1231
-
1232
- Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,
1233
- `self.pad_token_id`).
1234
- <Tip>
1235
-
1236
- If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors, the
1237
- result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
1238
- PyTorch tensors, you will lose the specific device of your tensors however.
1239
-
1240
- </Tip>
1241
-
1242
- Args:
1243
- encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, list[int]]`, `Dict[str, list[list[int]]` or `List[Dict[str, list[int]]]`):
1244
- Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, list[int]]`) or a batch of
1245
- tokenized inputs (list of [`BatchEncoding`], *Dict[str, list[list[int]]]* or *List[Dict[str,
1246
- list[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
1247
- collate function.
1248
-
1249
- Instead of `list[int]` you can have tensors (numpy arrays, PyTorch tensors), see
1250
- the note above for the return type.
1251
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
1252
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
1253
- index) among:
1254
-
1255
- - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
1256
- sequence if provided).
1257
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
1258
- acceptable input length for the model if that argument is not provided.
1259
- - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
1260
- lengths).
1261
- max_length (`int`, *optional*):
1262
- Maximum length of the returned list and optionally padding length (see above).
1263
- pad_to_multiple_of (`int`, *optional*):
1264
- If set will pad the sequence to a multiple of the provided value.
1265
-
1266
- This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
1267
- `>= 7.5` (Volta).
- padding_side (`str`, *optional*):
- The side on which the model should have padding applied. Should be selected between ['right', 'left'].
- Default value is picked from the class attribute of the same name.
- return_attention_mask (`bool`, *optional*):
- Whether to return the attention mask. If left to the default, will return the attention mask according
- to the specific tokenizer's default, defined by the `return_outputs` attribute.
-
- [What are attention masks?](../glossary#attention-mask)
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- verbose (`bool`, *optional*, defaults to `True`):
- Whether or not to print more information and warnings.
- """
- # If we have a list of dicts, let's convert it in a dict of lists
- # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
- if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
- # Call .keys() explicitly for compatibility with TensorDict and other Mapping subclasses
- encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
-
- # The model's main input name, usually `input_ids`, has been passed for padding
- if self.model_input_names[0] not in encoded_inputs:
- raise ValueError(
- "You should supply an encoding or a list of encodings to this method "
- f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
- )
-
- required_input = encoded_inputs[self.model_input_names[0]]
-
- if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):
- if return_attention_mask:
- encoded_inputs["attention_mask"] = []
- return encoded_inputs
-
- # If we have PyTorch/NumPy tensors/arrays as inputs, we cast them as python objects
- # and rebuild them afterwards if no return_tensors is specified
- # Note that we lose the specific device the tensor may be on for PyTorch
-
- first_element = required_input[0]
- if isinstance(first_element, (list, tuple)):
- # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
- for item in required_input:
- if len(item) != 0:
- first_element = item[0]
- break
- # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
- if not isinstance(first_element, (int, list, tuple)):
- if is_torch_tensor(first_element):
- return_tensors = "pt" if return_tensors is None else return_tensors
- elif isinstance(first_element, np.ndarray):
- return_tensors = "np" if return_tensors is None else return_tensors
- else:
- raise ValueError(
- f"type of {first_element} unknown: {type(first_element)}. "
- "Should be one of a python, numpy, or pytorch object."
- )
-
- for key, value in encoded_inputs.items():
- encoded_inputs[key] = to_py_obj(value)
-
- # Convert padding_strategy in PaddingStrategy
- padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
- padding=padding, max_length=max_length, verbose=verbose
- )
-
- required_input = encoded_inputs[self.model_input_names[0]]
- if required_input and not isinstance(required_input[0], (list, tuple)):
- encoded_inputs = self._pad(
- encoded_inputs,
- max_length=max_length,
- padding_strategy=padding_strategy,
- pad_to_multiple_of=pad_to_multiple_of,
- padding_side=padding_side,
- return_attention_mask=return_attention_mask,
- )
- return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
-
- batch_size = len(required_input)
- assert all(len(v) == batch_size for v in encoded_inputs.values()), (
- "Some items in the output dictionary have a different batch size than others."
- )
-
- if padding_strategy == PaddingStrategy.LONGEST:
- max_length = max(len(inputs) for inputs in required_input)
- padding_strategy = PaddingStrategy.MAX_LENGTH
-
- batch_outputs = {}
- for i in range(batch_size):
- inputs = {k: v[i] for k, v in encoded_inputs.items()}
- outputs = self._pad(
- inputs,
- max_length=max_length,
- padding_strategy=padding_strategy,
- pad_to_multiple_of=pad_to_multiple_of,
- padding_side=padding_side,
- return_attention_mask=return_attention_mask,
- )
-
- for key, value in outputs.items():
- if key not in batch_outputs:
- batch_outputs[key] = []
- batch_outputs[key].append(value)
-
- return BatchEncoding(batch_outputs, tensor_type=return_tensors)
-
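The override removed above documented the main use of `pad`: turning a list of per-example dicts into a padded batch, for instance as a DataLoader `collate_fn`. That pattern keeps working through the inherited base-class `pad`; a minimal sketch, assuming `tok` is any tokenizer instance exposing the base-class `pad` (the feature dicts below are illustrative only):

from torch.utils.data import DataLoader

# Illustrative unpadded features, as produced by a tokenizer called without padding.
features = [{"input_ids": [1, 5, 9]}, {"input_ids": [1, 7]}]

def collate_fn(batch):
    # `pad` accepts a list of dicts and returns a dict of padded tensors.
    return tok.pad(batch, padding="longest", return_tensors="pt")

loader = DataLoader(features, batch_size=2, collate_fn=collate_fn)
batch = next(iter(loader))  # contains "input_ids" and "attention_mask"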
- def truncate_sequences(
+ def truncate_sequences( # type: ignore[override]
  self,
  ids: list[int],
  pair_ids: None = None,
@@ -1407,47 +1009,36 @@ class MistralCommonBackend(PushToHubMixin):
  `Tuple[list[int], None, list[int]]`: The truncated `ids` and the list of
  overflowing tokens. `None` is returned to match Transformers signature.
  """
- if kwargs:
- raise ValueError(
- f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.truncate_sequences`."
- )
+
  if pair_ids:
  raise ValueError("`pair_ids` is not supported by `MistralCommonBackend.truncate_sequences`.")

- if num_tokens_to_remove <= 0:
- return (ids, None, [])
-
  if not isinstance(truncation_strategy, TruncationStrategy):
  truncation_strategy = TruncationStrategy(truncation_strategy)

- if truncation_strategy in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND]:
- raise ValueError(
- f"Only {TruncationStrategy.LONGEST_FIRST} and {TruncationStrategy.DO_NOT_TRUNCATE} are supported."
- )
+ if truncation_strategy in [
+ TruncationStrategy.ONLY_FIRST,
+ TruncationStrategy.ONLY_SECOND,
+ ]:
+ raise ValueError(f"{truncation_strategy=} is not supported by `MistralCommonBackend`.")
+
+ if num_tokens_to_remove <= 0:
+ return ids, None, []

  overflowing_tokens = []
- if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
- if len(ids) > num_tokens_to_remove:
- window_len = min(len(ids), stride + num_tokens_to_remove)
- if self.truncation_side == "left":
- overflowing_tokens = ids[:window_len]
- ids = ids[num_tokens_to_remove:]
- elif self.truncation_side == "right":
- overflowing_tokens = ids[-window_len:]
- ids = ids[:-num_tokens_to_remove]
- else:
- raise ValueError(f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'.")

+ if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
+ window_len = min(len(ids), stride + num_tokens_to_remove)
+ if self.truncation_side == "left":
+ overflowing_tokens = ids[:window_len]
+ ids = ids[num_tokens_to_remove:]
  else:
- error_msg = (
- f"We need to remove {num_tokens_to_remove} to truncate the input "
- f"but the first sequence has a length {len(ids)}. "
- )
- logger.error(error_msg)
+ overflowing_tokens = ids[-window_len:]
+ ids = ids[:-num_tokens_to_remove]

- return (ids, None, overflowing_tokens)
+ return ids, None, overflowing_tokens

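A minimal sketch of the rewritten truncation logic, assuming `tok` is a `MistralCommonBackend` instance with the default `truncation_side="right"`:

ids = list(range(10))
# Remove 3 tokens from the right; with stride=1 the overflow window is min(len(ids), 3 + 1) = 4 ids.
kept, _, overflow = tok.truncate_sequences(
    ids, num_tokens_to_remove=3, truncation_strategy="longest_first", stride=1
)
# kept     -> [0, 1, 2, 3, 4, 5, 6]
# overflow -> [6, 7, 8, 9]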
- def apply_chat_template(
+ def apply_chat_template( # type: ignore[override]
  self,
  conversation: list[dict[str, str]] | list[list[dict[str, str]]],
  tools: list[dict | Callable] | None = None,
@@ -1475,8 +1066,8 @@ class MistralCommonBackend(PushToHubMixin):
  [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
  for more information.
  add_generation_prompt (`bool`, *optional*):
- This argument is a no-op for `MistralCommonBackend`. However it cannot be used at the same time as `continue_final_message` to keep the API consistent and
- if any conversation ends with an assistant message, it will raise an error. In such case, use `continue_final_message` instead.
+ This argument is a no-op for `MistralCommonBackend`. However, it cannot be used at the same time as `continue_final_message` to keep the API consistent.
+ If any conversation ends with an assistant message, it will raise an error. In such cases, use `continue_final_message` instead.
  continue_final_message (bool, *optional*):
  If this is set, the chat will be formatted so that the final
  message in the chat is open-ended, without any EOS tokens. The model will continue this message
@@ -1511,8 +1102,7 @@ class MistralCommonBackend(PushToHubMixin):
  Will raise an error if used.

  Returns:
- `Union[str, list[int], list[str], list[list[int]], BatchEncoding]`: A list of token ids representing the tokenized chat so far, including control
- tokens. This output is ready to pass to the model, either directly or via methods like `generate()`.
+ `Union[str, list[int], list[str], list[list[int]], BatchEncoding]`: The tokenized chat so far, including control tokens. This output is ready to pass to the model, either directly or via methods like `generate()`.
  """
  if kwargs:
  raise ValueError(
@@ -1659,6 +1249,83 @@ class MistralCommonBackend(PushToHubMixin):
  )
  return outputs

+ def build_inputs_with_special_tokens(self, token_ids_0: list[int], token_ids_1: None = None) -> list[int]:
+ """
+ Build model inputs from a sequence by adding special tokens.
+
+ This method dynamically builds inputs based on the tokenizer's `mode`:
+ - `"test"`: [BOS] seq0
+ - `"finetuning"`: [BOS] seq0 [EOS]
+
+ Args:
+ token_ids_0 (`list[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`None`, *optional*): None, kept to match Transformers' signature.
+
+ Returns:
+ `list[int]`: List of input IDs with the appropriate special tokens.
+ """
+ if token_ids_1 is not None:
+ raise ValueError(
+ "`MistralCommonBackend` does not implement `token_ids_1 != None` for `build_inputs_with_special_tokens`."
+ )
+
+ if self.mode == ValidationMode.test:
+ # [BOS] seq0
+ return [self.bos_token_id] + token_ids_0
+
+ else:
+ # [BOS] seq0 [EOS]
+ return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
+
+ def create_token_type_ids_from_sequences(self, token_ids_0: list[int], token_ids_1: None = None) -> list[int]:
+ """
+ Create a mask of zeroes from the token ids with special tokens added.
+
+ Kept to match Transformers' implementation.
+
+ Args:
+ token_ids_0 (`list[int]`):
+ List of IDs.
+ token_ids_1 (`None`, *optional*): None, kept to match Transformers' signature.
+
+ Returns:
+ `list[int]`: Token type IDs according to the configured pattern.
+ """
+ if token_ids_1 is not None:
+ raise ValueError(
+ "`MistralCommonBackend` does not implement `token_ids_1 != None` for `create_token_type_ids_from_sequences`."
+ )
+
+ sequence = self.build_inputs_with_special_tokens(token_ids_0)
+
+ return [0] * len(sequence)
+
+ def num_special_tokens_to_add(self, pair: Literal[False] = False) -> int:
+ """
+ Returns the number of added tokens when encoding a sequence with special tokens.
+
+ <Tip>
+
+ This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
+ this inside your training loop.
+
+ </Tip>
+
+ Args:
+ pair (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
+
+ Returns:
+ `int`: Number of special tokens added to sequences.
+ """
+ if pair:
+ raise ValueError(
+ "`MistralCommonBackend` does not implement `pair = True` for `num_special_tokens_to_add`."
+ )
+
+ return len(self.build_inputs_with_special_tokens([], None))
+
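A short sketch of how these new helpers compose, assuming `tok` is a `MistralCommonBackend` loaded with the default `mode="test"` (the token ids below are placeholders):

ids = [17, 3263, 1044]  # any pre-tokenized ids
with_special = tok.build_inputs_with_special_tokens(ids)     # test mode: [BOS] + ids
token_types = tok.create_token_type_ids_from_sequences(ids)  # all zeros, same length as above
n_added = tok.num_special_tokens_to_add()                    # 1 in test mode, 2 in finetuning mode
assert len(token_types) == len(with_special) == len(ids) + n_added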
  @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
  def __call__(
  self,
@@ -1679,6 +1346,8 @@ class MistralCommonBackend(PushToHubMixin):
  return_special_tokens_mask: bool = False,
  return_length: bool = False,
  verbose: bool = True,
+ return_offsets_mapping: Literal[False] = False,
+ split_special_tokens: Literal[False] = False,
  **kwargs,
  ) -> BatchEncoding:
  """
@@ -1696,92 +1365,49 @@ class MistralCommonBackend(PushToHubMixin):
  text_pair_target (`None`, *optional*):
  Not supported by `MistralCommonBackend`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
  """
- if kwargs:
- raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.__call__`.")
+ if return_offsets_mapping or split_special_tokens:
+ raise ValueError(
+ "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
+ )

- if text_pair or text_target or text_pair_target:
+ if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND, "only_first", "only_second"]:
  raise ValueError(
- "`text_pair`, `text_target` and `text_pair_target` are not supported by `MistralCommonBackend`."
+ "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonBackend`."
  )

- def _is_valid_text_input(t):
- if isinstance(t, str):
- # Strings are fine
- return True
- elif isinstance(t, (list, tuple)):
- # List are fine as long as they are...
- if len(t) == 0:
- # ... empty
- return True
- elif isinstance(t[0], (str, int)):
- # ... list of strings or int
- return True
- elif isinstance(t[0], (list, tuple)):
- # ... list with an empty list or with a list of strings or with a list of ints
- return len(t[0]) == 0 or isinstance(t[0][0], (str, int))
- else:
- return False
- else:
- return False
+ if kwargs:
+ raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.__call__`.")

- if not _is_valid_text_input(text):
+ if text_pair or text_target or text_pair_target:
  raise ValueError(
- "text input must be of type `str` (single example), `list[str]` (batch or single encoded example) "
- "or `list[list[int]]` (batch of encoded examples)."
+ "`text_pair`, `text_target` and `text_pair_target` are not supported by `MistralCommonBackend`."
  )

- is_batched = isinstance(text, (list, tuple)) and isinstance(text[0], (str, list, tuple))
-
- padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+ return super().__call__(
+ text=text,
+ text_pair=text_pair,
+ text_target=text_target,
+ add_special_tokens=add_special_tokens,
  padding=padding,
  truncation=truncation,
  max_length=max_length,
+ stride=stride,
  pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
+ return_tensors=return_tensors,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_length=return_length,
  verbose=verbose,
- **kwargs,
  )

- if is_batched:
- return self._batch_encode_plus(
- batch_text=text,
- add_special_tokens=add_special_tokens,
- padding_strategy=padding_strategy,
- truncation_strategy=truncation_strategy,
- max_length=max_length,
- stride=stride,
- pad_to_multiple_of=pad_to_multiple_of,
- padding_side=padding_side,
- return_tensors=return_tensors,
- return_attention_mask=return_attention_mask,
- return_overflowing_tokens=return_overflowing_tokens,
- return_special_tokens_mask=return_special_tokens_mask,
- return_length=return_length,
- verbose=verbose,
- )
- else:
- return self._encode_plus(
- text=text,
- add_special_tokens=add_special_tokens,
- padding_strategy=padding_strategy,
- truncation_strategy=truncation_strategy,
- max_length=max_length,
- stride=stride,
- pad_to_multiple_of=pad_to_multiple_of,
- padding_side=padding_side,
- return_tensors=return_tensors,
- return_attention_mask=return_attention_mask,
- return_overflowing_tokens=return_overflowing_tokens,
- return_special_tokens_mask=return_special_tokens_mask,
- return_length=return_length,
- verbose=verbose,
- )
-
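With batching and padding/truncation resolution now delegated to `super().__call__`, the public behaviour is unchanged; a usage sketch, assuming `tok` is a `MistralCommonBackend` instance:

single = tok("Hello world")  # BatchEncoding with input_ids and attention_mask
batch = tok(["Hello world", "Bonjour"], padding=True, return_tensors="pt")

# The new guards reject unsupported options explicitly:
# tok("Hello", return_offsets_mapping=True)  -> ValueError
# tok("Hello", truncation="only_first")      -> ValueError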
  @classmethod
  def from_pretrained(
  cls,
  pretrained_model_name_or_path: str | os.PathLike,
  *init_inputs,
- mode: Union[str, ValidationMode] = ValidationMode.test,
+ mode: str | ValidationMode = ValidationMode.test,
  cache_dir: str | os.PathLike | None = None,
  force_download: bool = False,
  local_files_only: bool = False,
@@ -1808,9 +1434,9 @@ class MistralCommonBackend(PushToHubMixin):
  `./my_model_directory/`.
  mode (`Union[str, ValidationMode]`, *optional*, defaults to `ValidationMode.test`):
  Validation mode for the `MistralTokenizer` tokenizer. Possible values are:
- - `"finetuning"` or `ValidationMode.finetuning`: The finetuning mode.
+ - `"finetuning"` or `ValidationMode.finetuning`: The fine-tuning mode.
  - `"test"` or `ValidationMode.test`: The test mode.
- It changes how the tokenizer validates the input and prepare the request to the model.
+ It changes how the tokenizer validates the input and prepares the request to the model.
  cache_dir (`str` or `os.PathLike`, *optional*):
  Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the
  standard cache should not be used.
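As an illustration of the `mode` switch described above (the repo id below is only a placeholder, and the class defined in this module is assumed to be in scope):

repo = "mistralai/Mistral-7B-Instruct-v0.3"  # placeholder repo id
tok_test = MistralCommonBackend.from_pretrained(repo)                   # default: mode="test"
tok_ft = MistralCommonBackend.from_pretrained(repo, mode="finetuning")  # validates/prepares requests for fine-tuning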
@@ -1837,11 +1463,11 @@ class MistralCommonBackend(PushToHubMixin):
  Default value is picked from the class attribute of the same name.
  truncation_side (`str`, *optional*, defaults to `"right"`):
  The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
- model_input_names (`List[string]`, *optional*):
+ model_input_names (`List[str]`, *optional*):
  The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
  `"attention_mask"`). Default value is picked from the class attribute of the same name.
  clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
- Whether or not the model should cleanup the spaces that were added when splitting the input text during the
+ Whether or not the model should clean up the spaces that were added when splitting the input text during the
  tokenization process.
  kwargs (additional keyword arguments, *optional*):
  Not supported by `MistralCommonBackend.from_pretrained`.
@@ -1851,11 +1477,13 @@ class MistralCommonBackend(PushToHubMixin):
  raise ValueError("`init_inputs` are not supported by `MistralCommonBackend.from_pretrained`.")

  # Handle kwargs and AutoTokenizer/AutoProcessor case
- # These kwargs are passed by AutoTokenizer/AutoProcessor but are not used by MistralCommonBackend
- if kwargs and not set(kwargs.keys()).issubset(
- {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "_from_auto", "subfolder"}
- ):
- raise ValueError(f"Some kwargs in {kwargs} are not supported by `MistralCommonBackend.from_pretrained`.")
+ valid_kwargs = _VALID_INIT_KWARGS.union(
+ {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "subfolder"}
+ )
+ if kwargs and not set(kwargs.keys()).issubset(valid_kwargs):
+ raise ValueError(
+ f"Some kwargs in {list(kwargs.keys())} are not supported by `MistralCommonBackend.from_pretrained`."
+ )

  mode = cls._get_validation_mode(mode)

@@ -1869,35 +1497,8 @@ class MistralCommonBackend(PushToHubMixin):
  local_files_only=local_files_only,
  )
  else:
- valid_tokenizer_files = []
- tokenizer_file: str
-
- instruct_versions = list(TokenizerVersion.__members__)
- mm_versions = list(MultiModalVersion.__members__) + [""] # allow no mm version
- sentencepiece_suffixes = [f".model.{v}{m}" for v in instruct_versions for m in mm_versions] + [".model"]
-
- for path in os.listdir(pretrained_model_name_or_path):
- pathlib_repo_file = Path(path)
- file_name = pathlib_repo_file.name
- suffix = "".join(pathlib_repo_file.suffixes)
- if file_name == "tekken.json" or suffix in sentencepiece_suffixes:
- valid_tokenizer_files.append(file_name)
-
- if len(valid_tokenizer_files) == 0:
- raise ValueError(f"No tokenizer file found in directory: {pretrained_model_name_or_path}")
- # If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
- if len(valid_tokenizer_files) > 1:
- if "tekken.json" in valid_tokenizer_files:
- tokenizer_file = "tekken.json"
- else:
- tokenizer_file = max(valid_tokenizer_files)
- logger.warning(
- f"Multiple tokenizer files found in directory: {pretrained_model_name_or_path}. Using {tokenizer_file}."
- )
- else:
- tokenizer_file = valid_tokenizer_files[0]
-
- tokenizer_path = os.path.join(pretrained_model_name_or_path, tokenizer_file)
+ candidate_files = os.listdir(pretrained_model_name_or_path)
+ tokenizer_path = os.path.join(pretrained_model_name_or_path, get_one_valid_tokenizer_file(candidate_files))

  return cls(
  tokenizer_path=tokenizer_path,
@@ -1909,7 +1510,7 @@ class MistralCommonBackend(PushToHubMixin):
  clean_up_tokenization_spaces=clean_up_tokenization_spaces,
  )

- def save_pretrained(
+ def save_pretrained( # type: ignore[override]
  self,
  save_directory: str | os.PathLike | Path,
  push_to_hub: bool = False,
@@ -1971,7 +1572,7 @@ class MistralCommonBackend(PushToHubMixin):
  return (str(save_directory / self._tokenizer_path.name),)

  @staticmethod
- def _get_validation_mode(mode: Union[str, ValidationMode]) -> ValidationMode:
+ def _get_validation_mode(mode: str | ValidationMode) -> ValidationMode:
  """Get the validation mode from a string or a ValidationMode."""
  _invalid_mode_msg = (
  f"Invalid `mistral-common` tokenizer mode: {mode}. Possible values are 'finetuning' or 'test'."
@@ -1988,6 +1589,65 @@ class MistralCommonBackend(PushToHubMixin):
  raise ValueError(_invalid_mode_msg)
  return mode

+ def add_special_tokens(
+ self,
+ special_tokens_dict: dict[str, str | AddedToken | Sequence[str | AddedToken]],
+ replace_extra_special_tokens: bool = True,
+ ):
+ r"""`MistralCommonBackend` does not implement `add_special_tokens` by design.
+
+ If you would like this behaviour to be implemented, please open an issue in the `Transformers` or `mistral-common` repositories to request it.
+ """
+
+ raise NotImplementedError("`MistralCommonBackend` does not implement `add_special_tokens`.")
+
+ def add_tokens( # type: ignore[override]
+ self,
+ special_tokens_dict: dict[str, str | AddedToken | Sequence[str | AddedToken]],
+ replace_extra_special_tokens: bool = True,
+ ):
+ """
+ `MistralCommonBackend` does not implement `add_tokens` by design.
+
+ If you would like this behaviour to be implemented, please open an issue in the `Transformers` or `mistral-common` repositories to request it.
+ """
+
+ raise NotImplementedError("`MistralCommonBackend` does not implement `add_tokens`.")
+
+ def convert_added_tokens(cls, obj: AddedToken | Any, save: bool = False, add_type_field: bool = True): # type: ignore[override]
+ """
+ `MistralCommonBackend` does not implement `convert_added_tokens` by design.
+
+ If you would like this behaviour to be implemented, please open an issue in the `Transformers` or `mistral-common` repositories to request it.
+ """
+
+ raise NotImplementedError("`MistralCommonBackend` does not implement `convert_added_tokens`.")
+
+ def get_chat_template(self, chat_template: str | None = None, tools: list[dict] | None = None) -> str:
+ """`MistralCommonBackend` does not implement `get_chat_template` by design as `mistral-common` does not use chat templates."""
+
+ raise NotImplementedError("`MistralCommonBackend` does not implement `get_chat_template`.")
+
+ def save_chat_templates(
+ self,
+ save_directory: str | os.PathLike,
+ tokenizer_config: dict,
+ filename_prefix: str | None,
+ save_jinja_files: bool,
+ ):
+ """`MistralCommonBackend` does not implement `save_chat_templates` by design as `mistral-common` does not use chat templates."""
+
+ raise NotImplementedError("`MistralCommonBackend` does not implement `save_chat_templates`.")
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str, ...]:
+ """
+ `MistralCommonBackend` does not implement `save_vocabulary` by design.
+
+ This is because `mistral-common` is configured by one tokenizer file. If you'd like to save the vocabulary, please consider using the `save_pretrained` method instead.
+ """
+
+ raise NotImplementedError("`MistralCommonBackend` does not implement `save_vocabulary`.")
+

  # Backward compatibility alias for codebases still importing the legacy name.
  MistralCommonTokenizer = MistralCommonBackend