transformers 5.0.0rc2-py3-none-any.whl → 5.0.0rc3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
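For readers who want to reproduce a per-file summary like the table below, here is a minimal Python sketch. It assumes both wheels have already been downloaded locally (for example with pip download transformers==5.0.0rc2 --no-deps, and the same for rc3); OLD_WHEEL and NEW_WHEEL are placeholder paths, and the +/- counts come from Python's difflib, so they may not match the registry's own diff algorithm line for line.

import difflib
import zipfile

# Hypothetical local paths; the listing below was produced by the registry's
# own diff viewer, not by this script.
OLD_WHEEL = "transformers-5.0.0rc2-py3-none-any.whl"
NEW_WHEEL = "transformers-5.0.0rc3-py3-none-any.whl"

def python_sources(wheel_path):
    """Return {archive member name: list of source lines} for every .py file in the wheel."""
    with zipfile.ZipFile(wheel_path) as zf:
        return {
            name: zf.read(name).decode("utf-8", errors="replace").splitlines()
            for name in zf.namelist()
            if name.endswith(".py")
        }

old_files = python_sources(OLD_WHEEL)
new_files = python_sources(NEW_WHEEL)

# For each file present in either wheel, count added/removed lines from a unified diff.
for name in sorted(old_files.keys() | new_files.keys()):
    added = removed = 0
    for line in difflib.unified_diff(
        old_files.get(name, []), new_files.get(name, []), lineterm=""
    ):
        if line.startswith("+") and not line.startswith("+++"):
            added += 1
        elif line.startswith("-") and not line.startswith("---"):
            removed += 1
    if added or removed:
        print(f"{name} +{added} -{removed}")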
Files changed (1537)
  1. transformers/__init__.py +9 -28
  2. transformers/audio_utils.py +32 -32
  3. transformers/cache_utils.py +15 -124
  4. transformers/cli/chat.py +3 -3
  5. transformers/cli/serve.py +2 -2
  6. transformers/cli/transformers.py +2 -1
  7. transformers/configuration_utils.py +31 -33
  8. transformers/conversion_mapping.py +5 -1
  9. transformers/convert_slow_tokenizer.py +3 -8
  10. transformers/core_model_loading.py +14 -15
  11. transformers/data/processors/glue.py +0 -1
  12. transformers/data/processors/utils.py +0 -1
  13. transformers/data/processors/xnli.py +0 -1
  14. transformers/dependency_versions_table.py +4 -4
  15. transformers/distributed/configuration_utils.py +1 -2
  16. transformers/dynamic_module_utils.py +23 -23
  17. transformers/feature_extraction_sequence_utils.py +19 -23
  18. transformers/feature_extraction_utils.py +14 -14
  19. transformers/generation/candidate_generator.py +1 -2
  20. transformers/generation/configuration_utils.py +54 -39
  21. transformers/generation/continuous_batching/__init__.py +0 -1
  22. transformers/generation/continuous_batching/cache.py +34 -6
  23. transformers/generation/continuous_batching/cache_manager.py +25 -12
  24. transformers/generation/continuous_batching/continuous_api.py +54 -23
  25. transformers/generation/continuous_batching/requests.py +25 -4
  26. transformers/generation/continuous_batching/scheduler.py +117 -49
  27. transformers/generation/logits_process.py +0 -128
  28. transformers/generation/streamers.py +0 -1
  29. transformers/generation/utils.py +16 -26
  30. transformers/generation/watermarking.py +2 -3
  31. transformers/hf_argparser.py +9 -13
  32. transformers/hyperparameter_search.py +1 -2
  33. transformers/image_processing_base.py +9 -9
  34. transformers/image_processing_utils.py +11 -12
  35. transformers/image_processing_utils_fast.py +53 -53
  36. transformers/image_transforms.py +29 -29
  37. transformers/image_utils.py +30 -32
  38. transformers/integrations/awq.py +1 -3
  39. transformers/integrations/deepspeed.py +1 -1
  40. transformers/integrations/eetq.py +0 -1
  41. transformers/integrations/fbgemm_fp8.py +1 -2
  42. transformers/integrations/finegrained_fp8.py +8 -7
  43. transformers/integrations/flash_attention.py +1 -1
  44. transformers/integrations/flex_attention.py +1 -1
  45. transformers/integrations/fp_quant.py +4 -6
  46. transformers/integrations/ggml.py +0 -1
  47. transformers/integrations/integration_utils.py +2 -3
  48. transformers/integrations/mxfp4.py +5 -6
  49. transformers/integrations/quark.py +2 -4
  50. transformers/integrations/torchao.py +4 -6
  51. transformers/loss/loss_lw_detr.py +356 -0
  52. transformers/loss/loss_utils.py +2 -0
  53. transformers/masking_utils.py +47 -51
  54. transformers/model_debugging_utils.py +4 -5
  55. transformers/modelcard.py +14 -192
  56. transformers/modeling_attn_mask_utils.py +19 -19
  57. transformers/modeling_flash_attention_utils.py +27 -27
  58. transformers/modeling_gguf_pytorch_utils.py +5 -5
  59. transformers/modeling_layers.py +21 -22
  60. transformers/modeling_outputs.py +242 -253
  61. transformers/modeling_rope_utils.py +32 -32
  62. transformers/modeling_utils.py +67 -90
  63. transformers/models/__init__.py +4 -0
  64. transformers/models/afmoe/configuration_afmoe.py +26 -29
  65. transformers/models/afmoe/modeling_afmoe.py +30 -33
  66. transformers/models/afmoe/modular_afmoe.py +16 -18
  67. transformers/models/aimv2/configuration_aimv2.py +2 -5
  68. transformers/models/aimv2/modeling_aimv2.py +20 -21
  69. transformers/models/aimv2/modular_aimv2.py +7 -9
  70. transformers/models/albert/configuration_albert.py +0 -1
  71. transformers/models/albert/modeling_albert.py +67 -69
  72. transformers/models/albert/tokenization_albert.py +1 -4
  73. transformers/models/align/configuration_align.py +0 -1
  74. transformers/models/align/modeling_align.py +61 -62
  75. transformers/models/align/processing_align.py +2 -30
  76. transformers/models/altclip/configuration_altclip.py +0 -1
  77. transformers/models/altclip/modeling_altclip.py +76 -77
  78. transformers/models/altclip/processing_altclip.py +2 -15
  79. transformers/models/apertus/__init__.py +0 -1
  80. transformers/models/apertus/configuration_apertus.py +18 -21
  81. transformers/models/apertus/modeling_apertus.py +31 -34
  82. transformers/models/apertus/modular_apertus.py +28 -30
  83. transformers/models/arcee/configuration_arcee.py +20 -23
  84. transformers/models/arcee/modeling_arcee.py +31 -34
  85. transformers/models/arcee/modular_arcee.py +20 -23
  86. transformers/models/aria/configuration_aria.py +20 -23
  87. transformers/models/aria/image_processing_aria.py +25 -27
  88. transformers/models/aria/modeling_aria.py +63 -66
  89. transformers/models/aria/modular_aria.py +78 -85
  90. transformers/models/aria/processing_aria.py +28 -35
  91. transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +0 -1
  92. transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +3 -6
  93. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +6 -8
  94. transformers/models/audioflamingo3/__init__.py +0 -1
  95. transformers/models/audioflamingo3/configuration_audioflamingo3.py +0 -1
  96. transformers/models/audioflamingo3/modeling_audioflamingo3.py +22 -23
  97. transformers/models/audioflamingo3/modular_audioflamingo3.py +12 -17
  98. transformers/models/audioflamingo3/processing_audioflamingo3.py +6 -8
  99. transformers/models/auto/auto_factory.py +4 -5
  100. transformers/models/auto/configuration_auto.py +26 -5
  101. transformers/models/auto/feature_extraction_auto.py +5 -7
  102. transformers/models/auto/image_processing_auto.py +13 -26
  103. transformers/models/auto/modeling_auto.py +18 -199
  104. transformers/models/auto/processing_auto.py +2 -1
  105. transformers/models/auto/tokenization_auto.py +21 -22
  106. transformers/models/auto/video_processing_auto.py +7 -8
  107. transformers/models/autoformer/configuration_autoformer.py +4 -7
  108. transformers/models/autoformer/modeling_autoformer.py +98 -100
  109. transformers/models/aya_vision/configuration_aya_vision.py +0 -1
  110. transformers/models/aya_vision/modeling_aya_vision.py +35 -37
  111. transformers/models/aya_vision/modular_aya_vision.py +26 -29
  112. transformers/models/aya_vision/processing_aya_vision.py +25 -53
  113. transformers/models/bamba/configuration_bamba.py +29 -32
  114. transformers/models/bamba/modeling_bamba.py +60 -64
  115. transformers/models/bamba/modular_bamba.py +51 -55
  116. transformers/models/bark/configuration_bark.py +4 -7
  117. transformers/models/bark/generation_configuration_bark.py +3 -5
  118. transformers/models/bark/modeling_bark.py +40 -55
  119. transformers/models/bark/processing_bark.py +19 -41
  120. transformers/models/bart/configuration_bart.py +0 -1
  121. transformers/models/bart/modeling_bart.py +115 -117
  122. transformers/models/barthez/tokenization_barthez.py +1 -4
  123. transformers/models/bartpho/tokenization_bartpho.py +6 -7
  124. transformers/models/beit/configuration_beit.py +0 -11
  125. transformers/models/beit/image_processing_beit.py +53 -56
  126. transformers/models/beit/image_processing_beit_fast.py +8 -9
  127. transformers/models/beit/modeling_beit.py +51 -53
  128. transformers/models/bert/configuration_bert.py +0 -1
  129. transformers/models/bert/modeling_bert.py +111 -122
  130. transformers/models/bert/tokenization_bert.py +2 -4
  131. transformers/models/bert/tokenization_bert_legacy.py +3 -5
  132. transformers/models/bert_generation/configuration_bert_generation.py +0 -1
  133. transformers/models/bert_generation/modeling_bert_generation.py +47 -49
  134. transformers/models/bert_generation/tokenization_bert_generation.py +2 -3
  135. transformers/models/bert_japanese/tokenization_bert_japanese.py +5 -6
  136. transformers/models/bertweet/tokenization_bertweet.py +1 -3
  137. transformers/models/big_bird/configuration_big_bird.py +0 -1
  138. transformers/models/big_bird/modeling_big_bird.py +107 -109
  139. transformers/models/big_bird/tokenization_big_bird.py +1 -4
  140. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +0 -1
  141. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +109 -111
  142. transformers/models/biogpt/configuration_biogpt.py +0 -1
  143. transformers/models/biogpt/modeling_biogpt.py +69 -71
  144. transformers/models/biogpt/modular_biogpt.py +59 -61
  145. transformers/models/biogpt/tokenization_biogpt.py +3 -5
  146. transformers/models/bit/configuration_bit.py +0 -1
  147. transformers/models/bit/image_processing_bit.py +21 -24
  148. transformers/models/bit/image_processing_bit_fast.py +0 -1
  149. transformers/models/bit/modeling_bit.py +9 -11
  150. transformers/models/bitnet/configuration_bitnet.py +18 -21
  151. transformers/models/bitnet/modeling_bitnet.py +31 -34
  152. transformers/models/bitnet/modular_bitnet.py +4 -6
  153. transformers/models/blenderbot/configuration_blenderbot.py +0 -1
  154. transformers/models/blenderbot/modeling_blenderbot.py +64 -95
  155. transformers/models/blenderbot/tokenization_blenderbot.py +0 -1
  156. transformers/models/blenderbot_small/configuration_blenderbot_small.py +0 -1
  157. transformers/models/blenderbot_small/modeling_blenderbot_small.py +66 -68
  158. transformers/models/blenderbot_small/tokenization_blenderbot_small.py +1 -3
  159. transformers/models/blip/configuration_blip.py +0 -1
  160. transformers/models/blip/image_processing_blip.py +17 -20
  161. transformers/models/blip/image_processing_blip_fast.py +0 -1
  162. transformers/models/blip/modeling_blip.py +60 -71
  163. transformers/models/blip/modeling_blip_text.py +63 -65
  164. transformers/models/blip/processing_blip.py +5 -36
  165. transformers/models/blip_2/configuration_blip_2.py +0 -1
  166. transformers/models/blip_2/modeling_blip_2.py +70 -71
  167. transformers/models/blip_2/processing_blip_2.py +8 -38
  168. transformers/models/bloom/configuration_bloom.py +0 -1
  169. transformers/models/bloom/modeling_bloom.py +58 -59
  170. transformers/models/blt/configuration_blt.py +71 -74
  171. transformers/models/blt/modeling_blt.py +73 -76
  172. transformers/models/blt/modular_blt.py +57 -59
  173. transformers/models/bridgetower/configuration_bridgetower.py +0 -1
  174. transformers/models/bridgetower/image_processing_bridgetower.py +34 -35
  175. transformers/models/bridgetower/image_processing_bridgetower_fast.py +7 -8
  176. transformers/models/bridgetower/modeling_bridgetower.py +107 -109
  177. transformers/models/bridgetower/processing_bridgetower.py +2 -16
  178. transformers/models/bros/configuration_bros.py +0 -1
  179. transformers/models/bros/modeling_bros.py +78 -80
  180. transformers/models/bros/processing_bros.py +2 -12
  181. transformers/models/byt5/tokenization_byt5.py +4 -6
  182. transformers/models/camembert/configuration_camembert.py +0 -1
  183. transformers/models/camembert/modeling_camembert.py +91 -93
  184. transformers/models/camembert/modular_camembert.py +51 -54
  185. transformers/models/camembert/tokenization_camembert.py +1 -4
  186. transformers/models/canine/configuration_canine.py +0 -1
  187. transformers/models/canine/modeling_canine.py +73 -75
  188. transformers/models/canine/tokenization_canine.py +0 -1
  189. transformers/models/chameleon/configuration_chameleon.py +24 -27
  190. transformers/models/chameleon/image_processing_chameleon.py +21 -24
  191. transformers/models/chameleon/image_processing_chameleon_fast.py +0 -1
  192. transformers/models/chameleon/modeling_chameleon.py +53 -56
  193. transformers/models/chameleon/processing_chameleon.py +16 -41
  194. transformers/models/chinese_clip/configuration_chinese_clip.py +0 -1
  195. transformers/models/chinese_clip/image_processing_chinese_clip.py +21 -24
  196. transformers/models/chinese_clip/image_processing_chinese_clip_fast.py +0 -1
  197. transformers/models/chinese_clip/modeling_chinese_clip.py +65 -66
  198. transformers/models/chinese_clip/processing_chinese_clip.py +2 -15
  199. transformers/models/clap/configuration_clap.py +0 -1
  200. transformers/models/clap/feature_extraction_clap.py +9 -10
  201. transformers/models/clap/modeling_clap.py +88 -89
  202. transformers/models/clap/processing_clap.py +2 -15
  203. transformers/models/clip/configuration_clip.py +0 -1
  204. transformers/models/clip/image_processing_clip.py +21 -24
  205. transformers/models/clip/image_processing_clip_fast.py +0 -1
  206. transformers/models/clip/modeling_clip.py +45 -46
  207. transformers/models/clip/processing_clip.py +2 -14
  208. transformers/models/clip/tokenization_clip.py +2 -5
  209. transformers/models/clipseg/configuration_clipseg.py +0 -1
  210. transformers/models/clipseg/modeling_clipseg.py +86 -87
  211. transformers/models/clipseg/processing_clipseg.py +8 -39
  212. transformers/models/clvp/configuration_clvp.py +1 -3
  213. transformers/models/clvp/feature_extraction_clvp.py +7 -10
  214. transformers/models/clvp/modeling_clvp.py +119 -115
  215. transformers/models/clvp/number_normalizer.py +1 -2
  216. transformers/models/clvp/processing_clvp.py +3 -20
  217. transformers/models/clvp/tokenization_clvp.py +0 -1
  218. transformers/models/code_llama/tokenization_code_llama.py +3 -6
  219. transformers/models/codegen/configuration_codegen.py +0 -1
  220. transformers/models/codegen/modeling_codegen.py +48 -48
  221. transformers/models/codegen/tokenization_codegen.py +5 -6
  222. transformers/models/cohere/configuration_cohere.py +20 -23
  223. transformers/models/cohere/modeling_cohere.py +35 -38
  224. transformers/models/cohere/modular_cohere.py +24 -28
  225. transformers/models/cohere/tokenization_cohere.py +5 -6
  226. transformers/models/cohere2/configuration_cohere2.py +21 -24
  227. transformers/models/cohere2/modeling_cohere2.py +34 -37
  228. transformers/models/cohere2/modular_cohere2.py +39 -41
  229. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +6 -7
  230. transformers/models/cohere2_vision/modeling_cohere2_vision.py +28 -30
  231. transformers/models/cohere2_vision/modular_cohere2_vision.py +21 -23
  232. transformers/models/cohere2_vision/processing_cohere2_vision.py +6 -36
  233. transformers/models/colpali/configuration_colpali.py +0 -1
  234. transformers/models/colpali/modeling_colpali.py +14 -16
  235. transformers/models/colpali/modular_colpali.py +11 -51
  236. transformers/models/colpali/processing_colpali.py +14 -52
  237. transformers/models/colqwen2/modeling_colqwen2.py +20 -22
  238. transformers/models/colqwen2/modular_colqwen2.py +29 -68
  239. transformers/models/colqwen2/processing_colqwen2.py +16 -52
  240. transformers/models/conditional_detr/configuration_conditional_detr.py +0 -1
  241. transformers/models/conditional_detr/image_processing_conditional_detr.py +64 -66
  242. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +22 -22
  243. transformers/models/conditional_detr/modeling_conditional_detr.py +78 -80
  244. transformers/models/conditional_detr/modular_conditional_detr.py +1 -3
  245. transformers/models/convbert/configuration_convbert.py +0 -1
  246. transformers/models/convbert/modeling_convbert.py +85 -87
  247. transformers/models/convbert/tokenization_convbert.py +0 -1
  248. transformers/models/convnext/configuration_convnext.py +0 -1
  249. transformers/models/convnext/image_processing_convnext.py +18 -21
  250. transformers/models/convnext/image_processing_convnext_fast.py +5 -6
  251. transformers/models/convnext/modeling_convnext.py +5 -8
  252. transformers/models/convnextv2/configuration_convnextv2.py +0 -1
  253. transformers/models/convnextv2/modeling_convnextv2.py +5 -8
  254. transformers/models/cpm/tokenization_cpm.py +6 -7
  255. transformers/models/cpm/tokenization_cpm_fast.py +3 -5
  256. transformers/models/cpmant/configuration_cpmant.py +0 -1
  257. transformers/models/cpmant/modeling_cpmant.py +38 -40
  258. transformers/models/cpmant/tokenization_cpmant.py +1 -3
  259. transformers/models/csm/configuration_csm.py +49 -51
  260. transformers/models/csm/generation_csm.py +13 -14
  261. transformers/models/csm/modeling_csm.py +78 -81
  262. transformers/models/csm/modular_csm.py +56 -58
  263. transformers/models/csm/processing_csm.py +25 -68
  264. transformers/models/ctrl/configuration_ctrl.py +0 -1
  265. transformers/models/ctrl/modeling_ctrl.py +38 -41
  266. transformers/models/ctrl/tokenization_ctrl.py +0 -1
  267. transformers/models/cvt/configuration_cvt.py +0 -1
  268. transformers/models/cvt/modeling_cvt.py +13 -15
  269. transformers/models/cwm/__init__.py +0 -1
  270. transformers/models/cwm/configuration_cwm.py +3 -5
  271. transformers/models/cwm/modeling_cwm.py +32 -34
  272. transformers/models/cwm/modular_cwm.py +10 -12
  273. transformers/models/d_fine/configuration_d_fine.py +0 -1
  274. transformers/models/d_fine/modeling_d_fine.py +81 -82
  275. transformers/models/d_fine/modular_d_fine.py +8 -9
  276. transformers/models/dab_detr/configuration_dab_detr.py +0 -1
  277. transformers/models/dab_detr/modeling_dab_detr.py +68 -70
  278. transformers/models/dac/configuration_dac.py +0 -1
  279. transformers/models/dac/feature_extraction_dac.py +6 -9
  280. transformers/models/dac/modeling_dac.py +21 -23
  281. transformers/models/data2vec/configuration_data2vec_audio.py +0 -1
  282. transformers/models/data2vec/configuration_data2vec_text.py +0 -1
  283. transformers/models/data2vec/configuration_data2vec_vision.py +0 -1
  284. transformers/models/data2vec/modeling_data2vec_audio.py +52 -56
  285. transformers/models/data2vec/modeling_data2vec_text.py +91 -93
  286. transformers/models/data2vec/modeling_data2vec_vision.py +41 -42
  287. transformers/models/data2vec/modular_data2vec_audio.py +6 -1
  288. transformers/models/data2vec/modular_data2vec_text.py +51 -54
  289. transformers/models/dbrx/configuration_dbrx.py +18 -19
  290. transformers/models/dbrx/modeling_dbrx.py +39 -42
  291. transformers/models/dbrx/modular_dbrx.py +31 -33
  292. transformers/models/deberta/configuration_deberta.py +0 -1
  293. transformers/models/deberta/modeling_deberta.py +57 -60
  294. transformers/models/deberta/tokenization_deberta.py +2 -5
  295. transformers/models/deberta_v2/configuration_deberta_v2.py +0 -1
  296. transformers/models/deberta_v2/modeling_deberta_v2.py +63 -65
  297. transformers/models/deberta_v2/tokenization_deberta_v2.py +1 -4
  298. transformers/models/decision_transformer/configuration_decision_transformer.py +0 -1
  299. transformers/models/decision_transformer/modeling_decision_transformer.py +48 -50
  300. transformers/models/deepseek_v2/configuration_deepseek_v2.py +34 -37
  301. transformers/models/deepseek_v2/modeling_deepseek_v2.py +32 -33
  302. transformers/models/deepseek_v2/modular_deepseek_v2.py +40 -42
  303. transformers/models/deepseek_v3/configuration_deepseek_v3.py +35 -38
  304. transformers/models/deepseek_v3/modeling_deepseek_v3.py +31 -33
  305. transformers/models/deepseek_v3/modular_deepseek_v3.py +4 -5
  306. transformers/models/deepseek_vl/configuration_deepseek_vl.py +2 -3
  307. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +25 -26
  308. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +7 -6
  309. transformers/models/deepseek_vl/modeling_deepseek_vl.py +31 -31
  310. transformers/models/deepseek_vl/modular_deepseek_vl.py +11 -43
  311. transformers/models/deepseek_vl/processing_deepseek_vl.py +10 -41
  312. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +3 -5
  313. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +35 -35
  314. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +16 -16
  315. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +33 -33
  316. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +71 -90
  317. transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +12 -44
  318. transformers/models/deformable_detr/configuration_deformable_detr.py +0 -1
  319. transformers/models/deformable_detr/image_processing_deformable_detr.py +59 -61
  320. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +17 -17
  321. transformers/models/deformable_detr/modeling_deformable_detr.py +66 -67
  322. transformers/models/deformable_detr/modular_deformable_detr.py +1 -3
  323. transformers/models/deit/configuration_deit.py +0 -1
  324. transformers/models/deit/image_processing_deit.py +18 -21
  325. transformers/models/deit/image_processing_deit_fast.py +0 -1
  326. transformers/models/deit/modeling_deit.py +16 -18
  327. transformers/models/depth_anything/configuration_depth_anything.py +0 -1
  328. transformers/models/depth_anything/modeling_depth_anything.py +5 -8
  329. transformers/models/depth_pro/configuration_depth_pro.py +0 -1
  330. transformers/models/depth_pro/image_processing_depth_pro.py +22 -23
  331. transformers/models/depth_pro/image_processing_depth_pro_fast.py +6 -7
  332. transformers/models/depth_pro/modeling_depth_pro.py +21 -23
  333. transformers/models/detr/configuration_detr.py +0 -1
  334. transformers/models/detr/image_processing_detr.py +64 -66
  335. transformers/models/detr/image_processing_detr_fast.py +22 -23
  336. transformers/models/detr/modeling_detr.py +70 -72
  337. transformers/models/dia/configuration_dia.py +5 -8
  338. transformers/models/dia/feature_extraction_dia.py +6 -9
  339. transformers/models/dia/generation_dia.py +40 -36
  340. transformers/models/dia/modeling_dia.py +61 -64
  341. transformers/models/dia/modular_dia.py +52 -54
  342. transformers/models/dia/processing_dia.py +39 -29
  343. transformers/models/dia/tokenization_dia.py +3 -6
  344. transformers/models/diffllama/configuration_diffllama.py +20 -23
  345. transformers/models/diffllama/modeling_diffllama.py +42 -45
  346. transformers/models/diffllama/modular_diffllama.py +16 -18
  347. transformers/models/dinat/configuration_dinat.py +0 -1
  348. transformers/models/dinat/modeling_dinat.py +40 -42
  349. transformers/models/dinov2/configuration_dinov2.py +0 -1
  350. transformers/models/dinov2/modeling_dinov2.py +11 -13
  351. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +1 -1
  352. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +12 -13
  353. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +5 -7
  354. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +4 -7
  355. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +3 -6
  356. transformers/models/dinov3_vit/configuration_dinov3_vit.py +5 -8
  357. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +5 -6
  358. transformers/models/dinov3_vit/modeling_dinov3_vit.py +14 -16
  359. transformers/models/dinov3_vit/modular_dinov3_vit.py +11 -13
  360. transformers/models/distilbert/configuration_distilbert.py +0 -1
  361. transformers/models/distilbert/modeling_distilbert.py +44 -46
  362. transformers/models/distilbert/tokenization_distilbert.py +0 -1
  363. transformers/models/doge/__init__.py +0 -1
  364. transformers/models/doge/configuration_doge.py +25 -28
  365. transformers/models/doge/modeling_doge.py +42 -45
  366. transformers/models/doge/modular_doge.py +57 -58
  367. transformers/models/donut/configuration_donut_swin.py +0 -1
  368. transformers/models/donut/image_processing_donut.py +26 -29
  369. transformers/models/donut/image_processing_donut_fast.py +5 -10
  370. transformers/models/donut/modeling_donut_swin.py +44 -46
  371. transformers/models/donut/processing_donut.py +5 -26
  372. transformers/models/dots1/configuration_dots1.py +27 -29
  373. transformers/models/dots1/modeling_dots1.py +31 -34
  374. transformers/models/dots1/modular_dots1.py +0 -1
  375. transformers/models/dpr/configuration_dpr.py +0 -1
  376. transformers/models/dpr/modeling_dpr.py +37 -39
  377. transformers/models/dpr/tokenization_dpr.py +7 -9
  378. transformers/models/dpr/tokenization_dpr_fast.py +7 -9
  379. transformers/models/dpt/configuration_dpt.py +0 -1
  380. transformers/models/dpt/image_processing_dpt.py +65 -66
  381. transformers/models/dpt/image_processing_dpt_fast.py +13 -14
  382. transformers/models/dpt/modeling_dpt.py +19 -21
  383. transformers/models/dpt/modular_dpt.py +10 -11
  384. transformers/models/edgetam/configuration_edgetam.py +0 -1
  385. transformers/models/edgetam/modeling_edgetam.py +39 -41
  386. transformers/models/edgetam/modular_edgetam.py +2 -6
  387. transformers/models/edgetam_video/__init__.py +0 -1
  388. transformers/models/edgetam_video/configuration_edgetam_video.py +0 -1
  389. transformers/models/edgetam_video/modeling_edgetam_video.py +76 -77
  390. transformers/models/edgetam_video/modular_edgetam_video.py +16 -18
  391. transformers/models/efficientloftr/configuration_efficientloftr.py +4 -5
  392. transformers/models/efficientloftr/image_processing_efficientloftr.py +14 -16
  393. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -4
  394. transformers/models/efficientloftr/modeling_efficientloftr.py +27 -29
  395. transformers/models/efficientloftr/modular_efficientloftr.py +1 -3
  396. transformers/models/efficientnet/configuration_efficientnet.py +0 -1
  397. transformers/models/efficientnet/image_processing_efficientnet.py +23 -26
  398. transformers/models/efficientnet/image_processing_efficientnet_fast.py +14 -15
  399. transformers/models/efficientnet/modeling_efficientnet.py +12 -14
  400. transformers/models/electra/configuration_electra.py +0 -1
  401. transformers/models/electra/modeling_electra.py +101 -103
  402. transformers/models/emu3/configuration_emu3.py +5 -7
  403. transformers/models/emu3/image_processing_emu3.py +44 -39
  404. transformers/models/emu3/modeling_emu3.py +59 -62
  405. transformers/models/emu3/modular_emu3.py +32 -34
  406. transformers/models/emu3/processing_emu3.py +18 -43
  407. transformers/models/encodec/configuration_encodec.py +2 -4
  408. transformers/models/encodec/feature_extraction_encodec.py +10 -13
  409. transformers/models/encodec/modeling_encodec.py +25 -29
  410. transformers/models/encoder_decoder/configuration_encoder_decoder.py +0 -1
  411. transformers/models/encoder_decoder/modeling_encoder_decoder.py +17 -19
  412. transformers/models/eomt/configuration_eomt.py +0 -1
  413. transformers/models/eomt/image_processing_eomt.py +53 -55
  414. transformers/models/eomt/image_processing_eomt_fast.py +15 -16
  415. transformers/models/eomt/modeling_eomt.py +16 -18
  416. transformers/models/eomt/modular_eomt.py +11 -13
  417. transformers/models/ernie/configuration_ernie.py +0 -1
  418. transformers/models/ernie/modeling_ernie.py +121 -132
  419. transformers/models/ernie/modular_ernie.py +91 -103
  420. transformers/models/ernie4_5/configuration_ernie4_5.py +18 -20
  421. transformers/models/ernie4_5/modeling_ernie4_5.py +31 -33
  422. transformers/models/ernie4_5/modular_ernie4_5.py +1 -3
  423. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +27 -29
  424. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +36 -38
  425. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +7 -9
  426. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +0 -1
  427. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +34 -35
  428. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +6 -7
  429. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +84 -87
  430. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +86 -89
  431. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +3 -5
  432. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +17 -18
  433. transformers/models/esm/configuration_esm.py +2 -4
  434. transformers/models/esm/modeling_esm.py +32 -34
  435. transformers/models/esm/modeling_esmfold.py +42 -44
  436. transformers/models/esm/openfold_utils/chunk_utils.py +6 -6
  437. transformers/models/esm/openfold_utils/loss.py +1 -2
  438. transformers/models/esm/openfold_utils/protein.py +13 -13
  439. transformers/models/esm/openfold_utils/tensor_utils.py +6 -6
  440. transformers/models/esm/tokenization_esm.py +2 -4
  441. transformers/models/evolla/configuration_evolla.py +29 -32
  442. transformers/models/evolla/modeling_evolla.py +58 -61
  443. transformers/models/evolla/modular_evolla.py +45 -47
  444. transformers/models/evolla/processing_evolla.py +23 -35
  445. transformers/models/exaone4/configuration_exaone4.py +19 -22
  446. transformers/models/exaone4/modeling_exaone4.py +32 -35
  447. transformers/models/exaone4/modular_exaone4.py +40 -42
  448. transformers/models/falcon/configuration_falcon.py +22 -25
  449. transformers/models/falcon/modeling_falcon.py +73 -76
  450. transformers/models/falcon_h1/configuration_falcon_h1.py +40 -43
  451. transformers/models/falcon_h1/modeling_falcon_h1.py +52 -55
  452. transformers/models/falcon_h1/modular_falcon_h1.py +47 -48
  453. transformers/models/falcon_mamba/configuration_falcon_mamba.py +0 -1
  454. transformers/models/falcon_mamba/modeling_falcon_mamba.py +46 -47
  455. transformers/models/falcon_mamba/modular_falcon_mamba.py +10 -13
  456. transformers/models/fast_vlm/configuration_fast_vlm.py +1 -0
  457. transformers/models/fast_vlm/modeling_fast_vlm.py +36 -36
  458. transformers/models/fast_vlm/modular_fast_vlm.py +2 -3
  459. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +2 -5
  460. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +45 -47
  461. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -3
  462. transformers/models/flaubert/configuration_flaubert.py +0 -1
  463. transformers/models/flaubert/modeling_flaubert.py +124 -128
  464. transformers/models/flaubert/tokenization_flaubert.py +3 -5
  465. transformers/models/flava/configuration_flava.py +5 -6
  466. transformers/models/flava/image_processing_flava.py +66 -67
  467. transformers/models/flava/image_processing_flava_fast.py +42 -43
  468. transformers/models/flava/modeling_flava.py +108 -107
  469. transformers/models/flava/processing_flava.py +2 -12
  470. transformers/models/flex_olmo/__init__.py +0 -1
  471. transformers/models/flex_olmo/configuration_flex_olmo.py +23 -25
  472. transformers/models/flex_olmo/modeling_flex_olmo.py +37 -39
  473. transformers/models/flex_olmo/modular_flex_olmo.py +35 -37
  474. transformers/models/florence2/configuration_florence2.py +0 -1
  475. transformers/models/florence2/modeling_florence2.py +39 -40
  476. transformers/models/florence2/modular_florence2.py +52 -81
  477. transformers/models/florence2/processing_florence2.py +18 -47
  478. transformers/models/fnet/configuration_fnet.py +0 -1
  479. transformers/models/fnet/modeling_fnet.py +69 -80
  480. transformers/models/fnet/tokenization_fnet.py +0 -1
  481. transformers/models/focalnet/configuration_focalnet.py +0 -1
  482. transformers/models/focalnet/modeling_focalnet.py +39 -41
  483. transformers/models/fsmt/configuration_fsmt.py +0 -1
  484. transformers/models/fsmt/modeling_fsmt.py +47 -48
  485. transformers/models/fsmt/tokenization_fsmt.py +3 -5
  486. transformers/models/funnel/configuration_funnel.py +0 -1
  487. transformers/models/funnel/modeling_funnel.py +91 -93
  488. transformers/models/funnel/tokenization_funnel.py +2 -5
  489. transformers/models/fuyu/configuration_fuyu.py +23 -26
  490. transformers/models/fuyu/image_processing_fuyu.py +29 -31
  491. transformers/models/fuyu/image_processing_fuyu_fast.py +12 -13
  492. transformers/models/fuyu/modeling_fuyu.py +26 -29
  493. transformers/models/fuyu/processing_fuyu.py +9 -36
  494. transformers/models/gemma/configuration_gemma.py +20 -23
  495. transformers/models/gemma/modeling_gemma.py +32 -34
  496. transformers/models/gemma/modular_gemma.py +28 -29
  497. transformers/models/gemma/tokenization_gemma.py +3 -6
  498. transformers/models/gemma2/configuration_gemma2.py +25 -28
  499. transformers/models/gemma2/modeling_gemma2.py +34 -37
  500. transformers/models/gemma2/modular_gemma2.py +55 -57
  501. transformers/models/gemma3/configuration_gemma3.py +28 -29
  502. transformers/models/gemma3/image_processing_gemma3.py +29 -31
  503. transformers/models/gemma3/image_processing_gemma3_fast.py +9 -10
  504. transformers/models/gemma3/modeling_gemma3.py +86 -89
  505. transformers/models/gemma3/modular_gemma3.py +85 -86
  506. transformers/models/gemma3/processing_gemma3.py +5 -5
  507. transformers/models/gemma3n/configuration_gemma3n.py +9 -10
  508. transformers/models/gemma3n/feature_extraction_gemma3n.py +9 -11
  509. transformers/models/gemma3n/modeling_gemma3n.py +80 -89
  510. transformers/models/gemma3n/modular_gemma3n.py +66 -75
  511. transformers/models/gemma3n/processing_gemma3n.py +12 -26
  512. transformers/models/git/configuration_git.py +0 -1
  513. transformers/models/git/modeling_git.py +84 -86
  514. transformers/models/git/processing_git.py +2 -14
  515. transformers/models/glm/configuration_glm.py +19 -21
  516. transformers/models/glm/modeling_glm.py +32 -35
  517. transformers/models/glm/modular_glm.py +4 -7
  518. transformers/models/glm4/configuration_glm4.py +19 -21
  519. transformers/models/glm4/modeling_glm4.py +35 -37
  520. transformers/models/glm4/modular_glm4.py +8 -10
  521. transformers/models/glm46v/configuration_glm46v.py +0 -1
  522. transformers/models/glm46v/image_processing_glm46v.py +35 -36
  523. transformers/models/glm46v/image_processing_glm46v_fast.py +7 -7
  524. transformers/models/glm46v/modeling_glm46v.py +51 -51
  525. transformers/models/glm46v/modular_glm46v.py +1 -3
  526. transformers/models/glm46v/processing_glm46v.py +7 -41
  527. transformers/models/glm46v/video_processing_glm46v.py +9 -11
  528. transformers/models/glm4_moe/configuration_glm4_moe.py +25 -28
  529. transformers/models/glm4_moe/modeling_glm4_moe.py +32 -35
  530. transformers/models/glm4_moe/modular_glm4_moe.py +26 -29
  531. transformers/models/glm4_moe_lite/__init__.py +28 -0
  532. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +235 -0
  533. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +740 -0
  534. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +304 -0
  535. transformers/models/glm4v/configuration_glm4v.py +14 -17
  536. transformers/models/glm4v/image_processing_glm4v.py +34 -36
  537. transformers/models/glm4v/image_processing_glm4v_fast.py +6 -7
  538. transformers/models/glm4v/modeling_glm4v.py +133 -151
  539. transformers/models/glm4v/modular_glm4v.py +131 -182
  540. transformers/models/glm4v/processing_glm4v.py +7 -41
  541. transformers/models/glm4v/video_processing_glm4v.py +9 -11
  542. transformers/models/glm4v_moe/configuration_glm4v_moe.py +119 -122
  543. transformers/models/glm4v_moe/modeling_glm4v_moe.py +237 -297
  544. transformers/models/glm4v_moe/modular_glm4v_moe.py +54 -163
  545. transformers/models/glm_image/__init__.py +31 -0
  546. transformers/models/glm_image/configuration_glm_image.py +352 -0
  547. transformers/models/glm_image/image_processing_glm_image.py +503 -0
  548. transformers/models/glm_image/image_processing_glm_image_fast.py +296 -0
  549. transformers/models/glm_image/modeling_glm_image.py +1590 -0
  550. transformers/models/glm_image/modular_glm_image.py +1480 -0
  551. transformers/models/glm_image/processing_glm_image.py +217 -0
  552. transformers/models/glmasr/__init__.py +0 -1
  553. transformers/models/glmasr/configuration_glmasr.py +0 -1
  554. transformers/models/glmasr/modeling_glmasr.py +17 -18
  555. transformers/models/glmasr/modular_glmasr.py +16 -18
  556. transformers/models/glmasr/processing_glmasr.py +7 -8
  557. transformers/models/glpn/configuration_glpn.py +0 -1
  558. transformers/models/glpn/image_processing_glpn.py +11 -12
  559. transformers/models/glpn/image_processing_glpn_fast.py +8 -9
  560. transformers/models/glpn/modeling_glpn.py +10 -12
  561. transformers/models/got_ocr2/configuration_got_ocr2.py +5 -8
  562. transformers/models/got_ocr2/image_processing_got_ocr2.py +22 -24
  563. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +6 -7
  564. transformers/models/got_ocr2/modeling_got_ocr2.py +40 -42
  565. transformers/models/got_ocr2/modular_got_ocr2.py +31 -34
  566. transformers/models/got_ocr2/processing_got_ocr2.py +42 -63
  567. transformers/models/gpt2/configuration_gpt2.py +0 -1
  568. transformers/models/gpt2/modeling_gpt2.py +106 -108
  569. transformers/models/gpt2/tokenization_gpt2.py +6 -9
  570. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +0 -1
  571. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +73 -80
  572. transformers/models/gpt_neo/configuration_gpt_neo.py +0 -1
  573. transformers/models/gpt_neo/modeling_gpt_neo.py +63 -64
  574. transformers/models/gpt_neox/configuration_gpt_neox.py +19 -22
  575. transformers/models/gpt_neox/modeling_gpt_neox.py +70 -72
  576. transformers/models/gpt_neox/modular_gpt_neox.py +64 -66
  577. transformers/models/gpt_neox/tokenization_gpt_neox.py +2 -5
  578. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +15 -18
  579. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +41 -44
  580. transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +1 -3
  581. transformers/models/gpt_oss/configuration_gpt_oss.py +21 -24
  582. transformers/models/gpt_oss/modeling_gpt_oss.py +34 -35
  583. transformers/models/gpt_oss/modular_gpt_oss.py +17 -19
  584. transformers/models/gpt_sw3/tokenization_gpt_sw3.py +4 -4
  585. transformers/models/gptj/configuration_gptj.py +0 -1
  586. transformers/models/gptj/modeling_gptj.py +82 -81
  587. transformers/models/granite/configuration_granite.py +23 -26
  588. transformers/models/granite/modeling_granite.py +39 -41
  589. transformers/models/granite/modular_granite.py +29 -31
  590. transformers/models/granite_speech/configuration_granite_speech.py +0 -1
  591. transformers/models/granite_speech/feature_extraction_granite_speech.py +1 -3
  592. transformers/models/granite_speech/modeling_granite_speech.py +21 -23
  593. transformers/models/granite_speech/processing_granite_speech.py +11 -4
  594. transformers/models/granitemoe/configuration_granitemoe.py +26 -29
  595. transformers/models/granitemoe/modeling_granitemoe.py +35 -37
  596. transformers/models/granitemoe/modular_granitemoe.py +21 -23
  597. transformers/models/granitemoehybrid/__init__.py +0 -1
  598. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +38 -41
  599. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +60 -64
  600. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +18 -20
  601. transformers/models/granitemoeshared/configuration_granitemoeshared.py +27 -30
  602. transformers/models/granitemoeshared/modeling_granitemoeshared.py +48 -52
  603. transformers/models/granitemoeshared/modular_granitemoeshared.py +19 -21
  604. transformers/models/grounding_dino/configuration_grounding_dino.py +0 -1
  605. transformers/models/grounding_dino/image_processing_grounding_dino.py +60 -62
  606. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +17 -18
  607. transformers/models/grounding_dino/modeling_grounding_dino.py +94 -96
  608. transformers/models/grounding_dino/modular_grounding_dino.py +2 -3
  609. transformers/models/grounding_dino/processing_grounding_dino.py +10 -38
  610. transformers/models/groupvit/configuration_groupvit.py +0 -1
  611. transformers/models/groupvit/modeling_groupvit.py +69 -70
  612. transformers/models/helium/configuration_helium.py +20 -22
  613. transformers/models/helium/modeling_helium.py +33 -36
  614. transformers/models/helium/modular_helium.py +3 -7
  615. transformers/models/herbert/tokenization_herbert.py +4 -6
  616. transformers/models/hgnet_v2/configuration_hgnet_v2.py +0 -1
  617. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -9
  618. transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -9
  619. transformers/models/hiera/configuration_hiera.py +0 -1
  620. transformers/models/hiera/modeling_hiera.py +60 -62
  621. transformers/models/hubert/configuration_hubert.py +0 -1
  622. transformers/models/hubert/modeling_hubert.py +35 -37
  623. transformers/models/hubert/modular_hubert.py +8 -11
  624. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +21 -24
  625. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +30 -33
  626. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +3 -5
  627. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +25 -28
  628. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +32 -35
  629. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +5 -7
  630. transformers/models/ibert/configuration_ibert.py +0 -1
  631. transformers/models/ibert/modeling_ibert.py +60 -62
  632. transformers/models/ibert/quant_modules.py +0 -1
  633. transformers/models/idefics/configuration_idefics.py +0 -1
  634. transformers/models/idefics/image_processing_idefics.py +13 -15
  635. transformers/models/idefics/modeling_idefics.py +60 -61
  636. transformers/models/idefics/perceiver.py +1 -3
  637. transformers/models/idefics/processing_idefics.py +32 -48
  638. transformers/models/idefics/vision.py +22 -24
  639. transformers/models/idefics2/configuration_idefics2.py +0 -1
  640. transformers/models/idefics2/image_processing_idefics2.py +31 -32
  641. transformers/models/idefics2/image_processing_idefics2_fast.py +7 -8
  642. transformers/models/idefics2/modeling_idefics2.py +56 -58
  643. transformers/models/idefics2/processing_idefics2.py +10 -68
  644. transformers/models/idefics3/configuration_idefics3.py +0 -1
  645. transformers/models/idefics3/image_processing_idefics3.py +42 -43
  646. transformers/models/idefics3/image_processing_idefics3_fast.py +11 -12
  647. transformers/models/idefics3/modeling_idefics3.py +52 -54
  648. transformers/models/idefics3/processing_idefics3.py +15 -69
  649. transformers/models/ijepa/configuration_ijepa.py +0 -1
  650. transformers/models/ijepa/modeling_ijepa.py +10 -11
  651. transformers/models/ijepa/modular_ijepa.py +5 -7
  652. transformers/models/imagegpt/configuration_imagegpt.py +0 -1
  653. transformers/models/imagegpt/image_processing_imagegpt.py +17 -18
  654. transformers/models/imagegpt/image_processing_imagegpt_fast.py +8 -9
  655. transformers/models/imagegpt/modeling_imagegpt.py +57 -58
  656. transformers/models/informer/configuration_informer.py +6 -9
  657. transformers/models/informer/modeling_informer.py +84 -86
  658. transformers/models/informer/modular_informer.py +13 -16
  659. transformers/models/instructblip/configuration_instructblip.py +0 -1
  660. transformers/models/instructblip/modeling_instructblip.py +43 -44
  661. transformers/models/instructblip/processing_instructblip.py +10 -36
  662. transformers/models/instructblipvideo/configuration_instructblipvideo.py +0 -1
  663. transformers/models/instructblipvideo/modeling_instructblipvideo.py +55 -55
  664. transformers/models/instructblipvideo/modular_instructblipvideo.py +34 -36
  665. transformers/models/instructblipvideo/processing_instructblipvideo.py +14 -33
  666. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +4 -5
  667. transformers/models/internvl/configuration_internvl.py +0 -1
  668. transformers/models/internvl/modeling_internvl.py +41 -43
  669. transformers/models/internvl/modular_internvl.py +19 -21
  670. transformers/models/internvl/processing_internvl.py +12 -45
  671. transformers/models/internvl/video_processing_internvl.py +8 -9
  672. transformers/models/jais2/configuration_jais2.py +20 -22
  673. transformers/models/jais2/modeling_jais2.py +32 -34
  674. transformers/models/jais2/modular_jais2.py +20 -22
  675. transformers/models/jamba/configuration_jamba.py +0 -1
  676. transformers/models/jamba/modeling_jamba.py +43 -46
  677. transformers/models/jamba/modular_jamba.py +37 -38
  678. transformers/models/janus/configuration_janus.py +0 -1
  679. transformers/models/janus/image_processing_janus.py +35 -37
  680. transformers/models/janus/image_processing_janus_fast.py +12 -13
  681. transformers/models/janus/modeling_janus.py +41 -43
  682. transformers/models/janus/modular_janus.py +60 -63
  683. transformers/models/janus/processing_janus.py +17 -43
  684. transformers/models/jetmoe/configuration_jetmoe.py +20 -23
  685. transformers/models/jetmoe/modeling_jetmoe.py +39 -42
  686. transformers/models/jetmoe/modular_jetmoe.py +30 -33
  687. transformers/models/kosmos2/configuration_kosmos2.py +0 -1
  688. transformers/models/kosmos2/modeling_kosmos2.py +145 -146
  689. transformers/models/kosmos2/processing_kosmos2.py +40 -55
  690. transformers/models/kosmos2_5/__init__.py +0 -1
  691. transformers/models/kosmos2_5/configuration_kosmos2_5.py +0 -1
  692. transformers/models/kosmos2_5/image_processing_kosmos2_5.py +10 -12
  693. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -11
  694. transformers/models/kosmos2_5/modeling_kosmos2_5.py +108 -109
  695. transformers/models/kosmos2_5/processing_kosmos2_5.py +8 -29
  696. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +23 -25
  697. transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +12 -14
  698. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +59 -66
  699. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +19 -21
  700. transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py +2 -8
  701. transformers/models/lasr/configuration_lasr.py +1 -3
  702. transformers/models/lasr/feature_extraction_lasr.py +10 -12
  703. transformers/models/lasr/modeling_lasr.py +18 -21
  704. transformers/models/lasr/modular_lasr.py +8 -10
  705. transformers/models/lasr/processing_lasr.py +12 -6
  706. transformers/models/lasr/tokenization_lasr.py +2 -4
  707. transformers/models/layoutlm/configuration_layoutlm.py +0 -1
  708. transformers/models/layoutlm/modeling_layoutlm.py +67 -69
  709. transformers/models/layoutlmv2/configuration_layoutlmv2.py +0 -1
  710. transformers/models/layoutlmv2/image_processing_layoutlmv2.py +18 -21
  711. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +5 -6
  712. transformers/models/layoutlmv2/modeling_layoutlmv2.py +48 -50
  713. transformers/models/layoutlmv2/processing_layoutlmv2.py +14 -44
  714. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +63 -74
  715. transformers/models/layoutlmv3/configuration_layoutlmv3.py +0 -1
  716. transformers/models/layoutlmv3/image_processing_layoutlmv3.py +24 -26
  717. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +7 -8
  718. transformers/models/layoutlmv3/modeling_layoutlmv3.py +49 -51
  719. transformers/models/layoutlmv3/processing_layoutlmv3.py +14 -46
  720. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +64 -75
  721. transformers/models/layoutxlm/configuration_layoutxlm.py +0 -1
  722. transformers/models/layoutxlm/modular_layoutxlm.py +0 -1
  723. transformers/models/layoutxlm/processing_layoutxlm.py +14 -44
  724. transformers/models/layoutxlm/tokenization_layoutxlm.py +65 -76
  725. transformers/models/led/configuration_led.py +1 -4
  726. transformers/models/led/modeling_led.py +113 -267
  727. transformers/models/levit/configuration_levit.py +0 -1
  728. transformers/models/levit/image_processing_levit.py +19 -21
  729. transformers/models/levit/image_processing_levit_fast.py +0 -1
  730. transformers/models/levit/modeling_levit.py +17 -19
  731. transformers/models/lfm2/configuration_lfm2.py +22 -23
  732. transformers/models/lfm2/modeling_lfm2.py +42 -44
  733. transformers/models/lfm2/modular_lfm2.py +29 -29
  734. transformers/models/lfm2_moe/__init__.py +0 -1
  735. transformers/models/lfm2_moe/configuration_lfm2_moe.py +1 -2
  736. transformers/models/lfm2_moe/modeling_lfm2_moe.py +44 -45
  737. transformers/models/lfm2_moe/modular_lfm2_moe.py +8 -9
  738. transformers/models/lfm2_vl/configuration_lfm2_vl.py +0 -1
  739. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +34 -5
  740. transformers/models/lfm2_vl/modeling_lfm2_vl.py +31 -33
  741. transformers/models/lfm2_vl/modular_lfm2_vl.py +24 -27
  742. transformers/models/lfm2_vl/processing_lfm2_vl.py +14 -34
  743. transformers/models/lightglue/image_processing_lightglue.py +16 -15
  744. transformers/models/lightglue/image_processing_lightglue_fast.py +4 -4
  745. transformers/models/lightglue/modeling_lightglue.py +28 -30
  746. transformers/models/lightglue/modular_lightglue.py +28 -28
  747. transformers/models/lighton_ocr/__init__.py +28 -0
  748. transformers/models/lighton_ocr/configuration_lighton_ocr.py +128 -0
  749. transformers/models/lighton_ocr/modeling_lighton_ocr.py +460 -0
  750. transformers/models/lighton_ocr/modular_lighton_ocr.py +403 -0
  751. transformers/models/lighton_ocr/processing_lighton_ocr.py +229 -0
  752. transformers/models/lilt/configuration_lilt.py +0 -1
  753. transformers/models/lilt/modeling_lilt.py +53 -55
  754. transformers/models/llama/configuration_llama.py +21 -24
  755. transformers/models/llama/modeling_llama.py +31 -34
  756. transformers/models/llama/tokenization_llama.py +2 -4
  757. transformers/models/llama4/configuration_llama4.py +20 -22
  758. transformers/models/llama4/image_processing_llama4_fast.py +8 -9
  759. transformers/models/llama4/modeling_llama4.py +70 -71
  760. transformers/models/llama4/processing_llama4.py +33 -57
  761. transformers/models/llava/configuration_llava.py +0 -1
  762. transformers/models/llava/image_processing_llava.py +25 -28
  763. transformers/models/llava/image_processing_llava_fast.py +6 -7
  764. transformers/models/llava/modeling_llava.py +35 -37
  765. transformers/models/llava/processing_llava.py +18 -51
  766. transformers/models/llava_next/configuration_llava_next.py +0 -1
  767. transformers/models/llava_next/image_processing_llava_next.py +43 -45
  768. transformers/models/llava_next/image_processing_llava_next_fast.py +5 -6
  769. transformers/models/llava_next/modeling_llava_next.py +42 -44
  770. transformers/models/llava_next/processing_llava_next.py +18 -47
  771. transformers/models/llava_next_video/configuration_llava_next_video.py +0 -1
  772. transformers/models/llava_next_video/modeling_llava_next_video.py +53 -55
  773. transformers/models/llava_next_video/modular_llava_next_video.py +44 -46
  774. transformers/models/llava_next_video/processing_llava_next_video.py +21 -63
  775. transformers/models/llava_next_video/video_processing_llava_next_video.py +0 -1
  776. transformers/models/llava_onevision/configuration_llava_onevision.py +0 -1
  777. transformers/models/llava_onevision/image_processing_llava_onevision.py +40 -42
  778. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +6 -7
  779. transformers/models/llava_onevision/modeling_llava_onevision.py +60 -62
  780. transformers/models/llava_onevision/modular_llava_onevision.py +51 -52
  781. transformers/models/llava_onevision/processing_llava_onevision.py +21 -53
  782. transformers/models/llava_onevision/video_processing_llava_onevision.py +0 -1
  783. transformers/models/longcat_flash/__init__.py +0 -1
  784. transformers/models/longcat_flash/configuration_longcat_flash.py +32 -35
  785. transformers/models/longcat_flash/modeling_longcat_flash.py +30 -31
  786. transformers/models/longcat_flash/modular_longcat_flash.py +17 -19
  787. transformers/models/longformer/configuration_longformer.py +1 -4
  788. transformers/models/longformer/modeling_longformer.py +99 -101
  789. transformers/models/longt5/configuration_longt5.py +0 -1
  790. transformers/models/longt5/modeling_longt5.py +43 -44
  791. transformers/models/luke/configuration_luke.py +0 -1
  792. transformers/models/luke/modeling_luke.py +179 -181
  793. transformers/models/luke/tokenization_luke.py +99 -105
  794. transformers/models/lw_detr/__init__.py +27 -0
  795. transformers/models/lw_detr/configuration_lw_detr.py +374 -0
  796. transformers/models/lw_detr/modeling_lw_detr.py +1698 -0
  797. transformers/models/lw_detr/modular_lw_detr.py +1611 -0
  798. transformers/models/lxmert/configuration_lxmert.py +0 -1
  799. transformers/models/lxmert/modeling_lxmert.py +63 -74
  800. transformers/models/m2m_100/configuration_m2m_100.py +0 -1
  801. transformers/models/m2m_100/modeling_m2m_100.py +69 -71
  802. transformers/models/m2m_100/tokenization_m2m_100.py +8 -8
  803. transformers/models/mamba/configuration_mamba.py +0 -1
  804. transformers/models/mamba/modeling_mamba.py +43 -44
  805. transformers/models/mamba2/configuration_mamba2.py +0 -1
  806. transformers/models/mamba2/modeling_mamba2.py +44 -46
  807. transformers/models/marian/configuration_marian.py +0 -1
  808. transformers/models/marian/modeling_marian.py +84 -86
  809. transformers/models/marian/tokenization_marian.py +6 -6
  810. transformers/models/markuplm/configuration_markuplm.py +0 -1
  811. transformers/models/markuplm/feature_extraction_markuplm.py +1 -2
  812. transformers/models/markuplm/modeling_markuplm.py +60 -62
  813. transformers/models/markuplm/processing_markuplm.py +31 -38
  814. transformers/models/markuplm/tokenization_markuplm.py +67 -77
  815. transformers/models/mask2former/configuration_mask2former.py +4 -7
  816. transformers/models/mask2former/image_processing_mask2former.py +84 -85
  817. transformers/models/mask2former/image_processing_mask2former_fast.py +29 -29
  818. transformers/models/mask2former/modeling_mask2former.py +90 -92
  819. transformers/models/mask2former/modular_mask2former.py +6 -8
  820. transformers/models/maskformer/configuration_maskformer.py +5 -8
  821. transformers/models/maskformer/configuration_maskformer_swin.py +0 -1
  822. transformers/models/maskformer/image_processing_maskformer.py +84 -85
  823. transformers/models/maskformer/image_processing_maskformer_fast.py +28 -29
  824. transformers/models/maskformer/modeling_maskformer.py +56 -58
  825. transformers/models/maskformer/modeling_maskformer_swin.py +18 -20
  826. transformers/models/mbart/configuration_mbart.py +0 -1
  827. transformers/models/mbart/modeling_mbart.py +111 -113
  828. transformers/models/mbart/tokenization_mbart.py +2 -4
  829. transformers/models/mbart50/tokenization_mbart50.py +3 -5
  830. transformers/models/megatron_bert/configuration_megatron_bert.py +0 -1
  831. transformers/models/megatron_bert/modeling_megatron_bert.py +139 -150
  832. transformers/models/metaclip_2/modeling_metaclip_2.py +46 -46
  833. transformers/models/metaclip_2/modular_metaclip_2.py +19 -21
  834. transformers/models/mgp_str/configuration_mgp_str.py +0 -1
  835. transformers/models/mgp_str/modeling_mgp_str.py +14 -16
  836. transformers/models/mgp_str/processing_mgp_str.py +3 -20
  837. transformers/models/mgp_str/tokenization_mgp_str.py +1 -3
  838. transformers/models/mimi/configuration_mimi.py +38 -40
  839. transformers/models/mimi/modeling_mimi.py +76 -79
  840. transformers/models/minimax/__init__.py +0 -1
  841. transformers/models/minimax/configuration_minimax.py +32 -36
  842. transformers/models/minimax/modeling_minimax.py +41 -44
  843. transformers/models/minimax/modular_minimax.py +50 -53
  844. transformers/models/minimax_m2/__init__.py +28 -0
  845. transformers/models/minimax_m2/configuration_minimax_m2.py +211 -0
  846. transformers/models/minimax_m2/modeling_minimax_m2.py +704 -0
  847. transformers/models/minimax_m2/modular_minimax_m2.py +369 -0
  848. transformers/models/ministral/configuration_ministral.py +20 -22
  849. transformers/models/ministral/modeling_ministral.py +31 -33
  850. transformers/models/ministral/modular_ministral.py +27 -29
  851. transformers/models/ministral3/configuration_ministral3.py +19 -22
  852. transformers/models/ministral3/modeling_ministral3.py +31 -33
  853. transformers/models/ministral3/modular_ministral3.py +4 -5
  854. transformers/models/mistral/configuration_mistral.py +19 -22
  855. transformers/models/mistral/modeling_mistral.py +31 -33
  856. transformers/models/mistral/modular_mistral.py +11 -12
  857. transformers/models/mistral3/configuration_mistral3.py +0 -1
  858. transformers/models/mistral3/modeling_mistral3.py +43 -42
  859. transformers/models/mistral3/modular_mistral3.py +35 -35
  860. transformers/models/mixtral/configuration_mixtral.py +24 -27
  861. transformers/models/mixtral/modeling_mixtral.py +35 -38
  862. transformers/models/mixtral/modular_mixtral.py +26 -29
  863. transformers/models/mlcd/configuration_mlcd.py +0 -1
  864. transformers/models/mlcd/modeling_mlcd.py +10 -12
  865. transformers/models/mlcd/modular_mlcd.py +9 -11
  866. transformers/models/mllama/configuration_mllama.py +5 -8
  867. transformers/models/mllama/image_processing_mllama.py +23 -25
  868. transformers/models/mllama/image_processing_mllama_fast.py +5 -6
  869. transformers/models/mllama/modeling_mllama.py +81 -84
  870. transformers/models/mllama/processing_mllama.py +6 -55
  871. transformers/models/mluke/tokenization_mluke.py +97 -103
  872. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +0 -1
  873. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +94 -96
  874. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +0 -1
  875. transformers/models/mobilebert/configuration_mobilebert.py +0 -1
  876. transformers/models/mobilebert/modeling_mobilebert.py +75 -85
  877. transformers/models/mobilebert/tokenization_mobilebert.py +0 -1
  878. transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +0 -1
  879. transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +20 -23
  880. transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +0 -1
  881. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +13 -16
  882. transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +0 -1
  883. transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +48 -51
  884. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +10 -11
  885. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +17 -20
  886. transformers/models/mobilevit/configuration_mobilevit.py +0 -1
  887. transformers/models/mobilevit/image_processing_mobilevit.py +41 -44
  888. transformers/models/mobilevit/image_processing_mobilevit_fast.py +8 -9
  889. transformers/models/mobilevit/modeling_mobilevit.py +17 -19
  890. transformers/models/mobilevitv2/configuration_mobilevitv2.py +0 -1
  891. transformers/models/mobilevitv2/modeling_mobilevitv2.py +17 -20
  892. transformers/models/modernbert/configuration_modernbert.py +34 -34
  893. transformers/models/modernbert/modeling_modernbert.py +123 -125
  894. transformers/models/modernbert/modular_modernbert.py +155 -155
  895. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +30 -32
  896. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +45 -47
  897. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +69 -70
  898. transformers/models/moonshine/configuration_moonshine.py +22 -24
  899. transformers/models/moonshine/modeling_moonshine.py +63 -65
  900. transformers/models/moonshine/modular_moonshine.py +72 -73
  901. transformers/models/moshi/configuration_moshi.py +18 -21
  902. transformers/models/moshi/modeling_moshi.py +130 -133
  903. transformers/models/mpnet/configuration_mpnet.py +0 -1
  904. transformers/models/mpnet/modeling_mpnet.py +55 -57
  905. transformers/models/mpnet/tokenization_mpnet.py +1 -4
  906. transformers/models/mpt/configuration_mpt.py +1 -9
  907. transformers/models/mpt/modeling_mpt.py +58 -60
  908. transformers/models/mra/configuration_mra.py +0 -1
  909. transformers/models/mra/modeling_mra.py +54 -56
  910. transformers/models/mt5/configuration_mt5.py +0 -1
  911. transformers/models/mt5/modeling_mt5.py +75 -77
  912. transformers/models/musicgen/configuration_musicgen.py +0 -1
  913. transformers/models/musicgen/modeling_musicgen.py +108 -111
  914. transformers/models/musicgen/processing_musicgen.py +3 -21
  915. transformers/models/musicgen_melody/configuration_musicgen_melody.py +0 -1
  916. transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +8 -9
  917. transformers/models/musicgen_melody/modeling_musicgen_melody.py +106 -109
  918. transformers/models/musicgen_melody/processing_musicgen_melody.py +3 -22
  919. transformers/models/mvp/configuration_mvp.py +0 -1
  920. transformers/models/mvp/modeling_mvp.py +115 -119
  921. transformers/models/myt5/tokenization_myt5.py +8 -10
  922. transformers/models/nanochat/configuration_nanochat.py +0 -1
  923. transformers/models/nanochat/modeling_nanochat.py +32 -35
  924. transformers/models/nanochat/modular_nanochat.py +12 -14
  925. transformers/models/nemotron/configuration_nemotron.py +20 -23
  926. transformers/models/nemotron/modeling_nemotron.py +49 -52
  927. transformers/models/nllb/tokenization_nllb.py +7 -9
  928. transformers/models/nllb_moe/configuration_nllb_moe.py +0 -1
  929. transformers/models/nllb_moe/modeling_nllb_moe.py +67 -69
  930. transformers/models/nougat/image_processing_nougat.py +29 -32
  931. transformers/models/nougat/image_processing_nougat_fast.py +4 -5
  932. transformers/models/nougat/processing_nougat.py +37 -39
  933. transformers/models/nougat/tokenization_nougat.py +5 -7
  934. transformers/models/nystromformer/configuration_nystromformer.py +0 -1
  935. transformers/models/nystromformer/modeling_nystromformer.py +61 -63
  936. transformers/models/olmo/configuration_olmo.py +18 -21
  937. transformers/models/olmo/modeling_olmo.py +31 -34
  938. transformers/models/olmo/modular_olmo.py +5 -9
  939. transformers/models/olmo2/configuration_olmo2.py +18 -21
  940. transformers/models/olmo2/modeling_olmo2.py +32 -35
  941. transformers/models/olmo2/modular_olmo2.py +29 -31
  942. transformers/models/olmo3/__init__.py +0 -1
  943. transformers/models/olmo3/configuration_olmo3.py +20 -23
  944. transformers/models/olmo3/modeling_olmo3.py +31 -34
  945. transformers/models/olmo3/modular_olmo3.py +31 -33
  946. transformers/models/olmoe/configuration_olmoe.py +24 -26
  947. transformers/models/olmoe/modeling_olmoe.py +37 -39
  948. transformers/models/olmoe/modular_olmoe.py +12 -13
  949. transformers/models/omdet_turbo/configuration_omdet_turbo.py +0 -1
  950. transformers/models/omdet_turbo/modeling_omdet_turbo.py +38 -40
  951. transformers/models/omdet_turbo/processing_omdet_turbo.py +19 -67
  952. transformers/models/oneformer/configuration_oneformer.py +4 -7
  953. transformers/models/oneformer/image_processing_oneformer.py +83 -84
  954. transformers/models/oneformer/image_processing_oneformer_fast.py +33 -34
  955. transformers/models/oneformer/modeling_oneformer.py +123 -124
  956. transformers/models/oneformer/processing_oneformer.py +28 -43
  957. transformers/models/openai/configuration_openai.py +0 -1
  958. transformers/models/openai/modeling_openai.py +50 -51
  959. transformers/models/openai/tokenization_openai.py +2 -5
  960. transformers/models/opt/configuration_opt.py +0 -1
  961. transformers/models/opt/modeling_opt.py +74 -75
  962. transformers/models/ovis2/__init__.py +0 -1
  963. transformers/models/ovis2/configuration_ovis2.py +0 -1
  964. transformers/models/ovis2/image_processing_ovis2.py +22 -24
  965. transformers/models/ovis2/image_processing_ovis2_fast.py +6 -7
  966. transformers/models/ovis2/modeling_ovis2.py +43 -45
  967. transformers/models/ovis2/modular_ovis2.py +30 -32
  968. transformers/models/ovis2/processing_ovis2.py +12 -40
  969. transformers/models/owlv2/configuration_owlv2.py +0 -1
  970. transformers/models/owlv2/image_processing_owlv2.py +20 -21
  971. transformers/models/owlv2/image_processing_owlv2_fast.py +7 -8
  972. transformers/models/owlv2/modeling_owlv2.py +82 -87
  973. transformers/models/owlv2/modular_owlv2.py +6 -7
  974. transformers/models/owlv2/processing_owlv2.py +20 -49
  975. transformers/models/owlvit/configuration_owlvit.py +0 -1
  976. transformers/models/owlvit/image_processing_owlvit.py +21 -22
  977. transformers/models/owlvit/image_processing_owlvit_fast.py +2 -3
  978. transformers/models/owlvit/modeling_owlvit.py +81 -86
  979. transformers/models/owlvit/processing_owlvit.py +20 -48
  980. transformers/models/paddleocr_vl/__init__.py +0 -1
  981. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +19 -19
  982. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +34 -35
  983. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +12 -12
  984. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +76 -76
  985. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +68 -68
  986. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +1 -3
  987. transformers/models/paligemma/configuration_paligemma.py +0 -1
  988. transformers/models/paligemma/modeling_paligemma.py +51 -53
  989. transformers/models/paligemma/processing_paligemma.py +13 -66
  990. transformers/models/parakeet/configuration_parakeet.py +1 -4
  991. transformers/models/parakeet/feature_extraction_parakeet.py +10 -12
  992. transformers/models/parakeet/modeling_parakeet.py +18 -22
  993. transformers/models/parakeet/modular_parakeet.py +16 -18
  994. transformers/models/parakeet/processing_parakeet.py +12 -5
  995. transformers/models/parakeet/tokenization_parakeet.py +2 -4
  996. transformers/models/patchtsmixer/configuration_patchtsmixer.py +5 -8
  997. transformers/models/patchtsmixer/modeling_patchtsmixer.py +60 -62
  998. transformers/models/patchtst/configuration_patchtst.py +6 -9
  999. transformers/models/patchtst/modeling_patchtst.py +72 -74
  1000. transformers/models/pe_audio/__init__.py +0 -1
  1001. transformers/models/pe_audio/configuration_pe_audio.py +14 -16
  1002. transformers/models/pe_audio/feature_extraction_pe_audio.py +6 -8
  1003. transformers/models/pe_audio/modeling_pe_audio.py +26 -27
  1004. transformers/models/pe_audio/modular_pe_audio.py +16 -17
  1005. transformers/models/pe_audio/processing_pe_audio.py +0 -1
  1006. transformers/models/pe_audio_video/__init__.py +0 -1
  1007. transformers/models/pe_audio_video/configuration_pe_audio_video.py +15 -17
  1008. transformers/models/pe_audio_video/modeling_pe_audio_video.py +60 -61
  1009. transformers/models/pe_audio_video/modular_pe_audio_video.py +52 -53
  1010. transformers/models/pe_audio_video/processing_pe_audio_video.py +0 -1
  1011. transformers/models/pe_video/__init__.py +0 -1
  1012. transformers/models/pe_video/configuration_pe_video.py +14 -16
  1013. transformers/models/pe_video/modeling_pe_video.py +21 -22
  1014. transformers/models/pe_video/modular_pe_video.py +11 -12
  1015. transformers/models/pe_video/video_processing_pe_video.py +2 -4
  1016. transformers/models/pegasus/configuration_pegasus.py +0 -1
  1017. transformers/models/pegasus/modeling_pegasus.py +63 -65
  1018. transformers/models/pegasus/tokenization_pegasus.py +1 -4
  1019. transformers/models/pegasus_x/configuration_pegasus_x.py +0 -1
  1020. transformers/models/pegasus_x/modeling_pegasus_x.py +50 -52
  1021. transformers/models/perceiver/configuration_perceiver.py +0 -1
  1022. transformers/models/perceiver/image_processing_perceiver.py +22 -25
  1023. transformers/models/perceiver/image_processing_perceiver_fast.py +5 -6
  1024. transformers/models/perceiver/modeling_perceiver.py +135 -136
  1025. transformers/models/perceiver/tokenization_perceiver.py +3 -6
  1026. transformers/models/perception_lm/configuration_perception_lm.py +0 -1
  1027. transformers/models/perception_lm/image_processing_perception_lm_fast.py +8 -9
  1028. transformers/models/perception_lm/modeling_perception_lm.py +38 -40
  1029. transformers/models/perception_lm/modular_perception_lm.py +31 -33
  1030. transformers/models/perception_lm/processing_perception_lm.py +13 -47
  1031. transformers/models/perception_lm/video_processing_perception_lm.py +0 -1
  1032. transformers/models/persimmon/configuration_persimmon.py +18 -21
  1033. transformers/models/persimmon/modeling_persimmon.py +39 -42
  1034. transformers/models/phi/configuration_phi.py +19 -22
  1035. transformers/models/phi/modeling_phi.py +35 -37
  1036. transformers/models/phi/modular_phi.py +23 -23
  1037. transformers/models/phi3/configuration_phi3.py +23 -26
  1038. transformers/models/phi3/modeling_phi3.py +33 -36
  1039. transformers/models/phi3/modular_phi3.py +13 -17
  1040. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +25 -26
  1041. transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +7 -9
  1042. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +7 -7
  1043. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +54 -56
  1044. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +59 -60
  1045. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +7 -42
  1046. transformers/models/phimoe/configuration_phimoe.py +26 -29
  1047. transformers/models/phimoe/modeling_phimoe.py +35 -38
  1048. transformers/models/phimoe/modular_phimoe.py +0 -1
  1049. transformers/models/phobert/tokenization_phobert.py +4 -6
  1050. transformers/models/pix2struct/configuration_pix2struct.py +0 -1
  1051. transformers/models/pix2struct/image_processing_pix2struct.py +15 -19
  1052. transformers/models/pix2struct/image_processing_pix2struct_fast.py +7 -10
  1053. transformers/models/pix2struct/modeling_pix2struct.py +42 -45
  1054. transformers/models/pix2struct/processing_pix2struct.py +5 -26
  1055. transformers/models/pixio/__init__.py +0 -1
  1056. transformers/models/pixio/configuration_pixio.py +0 -1
  1057. transformers/models/pixio/modeling_pixio.py +7 -9
  1058. transformers/models/pixio/modular_pixio.py +3 -6
  1059. transformers/models/pixtral/configuration_pixtral.py +11 -14
  1060. transformers/models/pixtral/image_processing_pixtral.py +26 -28
  1061. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -6
  1062. transformers/models/pixtral/modeling_pixtral.py +22 -25
  1063. transformers/models/pixtral/processing_pixtral.py +18 -52
  1064. transformers/models/plbart/configuration_plbart.py +0 -1
  1065. transformers/models/plbart/modeling_plbart.py +100 -102
  1066. transformers/models/plbart/modular_plbart.py +30 -32
  1067. transformers/models/plbart/tokenization_plbart.py +4 -5
  1068. transformers/models/poolformer/configuration_poolformer.py +0 -1
  1069. transformers/models/poolformer/image_processing_poolformer.py +21 -24
  1070. transformers/models/poolformer/image_processing_poolformer_fast.py +6 -7
  1071. transformers/models/poolformer/modeling_poolformer.py +10 -12
  1072. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  1073. transformers/models/pop2piano/feature_extraction_pop2piano.py +6 -9
  1074. transformers/models/pop2piano/modeling_pop2piano.py +22 -23
  1075. transformers/models/pop2piano/processing_pop2piano.py +25 -33
  1076. transformers/models/pop2piano/tokenization_pop2piano.py +15 -23
  1077. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +1 -0
  1078. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +28 -28
  1079. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +14 -15
  1080. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +9 -10
  1081. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +9 -10
  1082. transformers/models/prophetnet/configuration_prophetnet.py +26 -28
  1083. transformers/models/prophetnet/modeling_prophetnet.py +109 -130
  1084. transformers/models/prophetnet/tokenization_prophetnet.py +14 -16
  1085. transformers/models/pvt/configuration_pvt.py +0 -1
  1086. transformers/models/pvt/image_processing_pvt.py +17 -20
  1087. transformers/models/pvt/image_processing_pvt_fast.py +0 -1
  1088. transformers/models/pvt/modeling_pvt.py +19 -21
  1089. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  1090. transformers/models/pvt_v2/modeling_pvt_v2.py +21 -23
  1091. transformers/models/qwen2/configuration_qwen2.py +18 -21
  1092. transformers/models/qwen2/modeling_qwen2.py +31 -33
  1093. transformers/models/qwen2/modular_qwen2.py +11 -12
  1094. transformers/models/qwen2/tokenization_qwen2.py +2 -5
  1095. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +20 -23
  1096. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +135 -128
  1097. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +116 -109
  1098. transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +41 -49
  1099. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +22 -25
  1100. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +94 -96
  1101. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +46 -85
  1102. transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +7 -43
  1103. transformers/models/qwen2_audio/configuration_qwen2_audio.py +0 -1
  1104. transformers/models/qwen2_audio/modeling_qwen2_audio.py +27 -29
  1105. transformers/models/qwen2_audio/processing_qwen2_audio.py +13 -42
  1106. transformers/models/qwen2_moe/configuration_qwen2_moe.py +28 -31
  1107. transformers/models/qwen2_moe/modeling_qwen2_moe.py +36 -39
  1108. transformers/models/qwen2_moe/modular_qwen2_moe.py +7 -10
  1109. transformers/models/qwen2_vl/configuration_qwen2_vl.py +22 -24
  1110. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +38 -40
  1111. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +8 -9
  1112. transformers/models/qwen2_vl/modeling_qwen2_vl.py +91 -92
  1113. transformers/models/qwen2_vl/processing_qwen2_vl.py +7 -44
  1114. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +35 -13
  1115. transformers/models/qwen3/configuration_qwen3.py +20 -23
  1116. transformers/models/qwen3/modeling_qwen3.py +31 -34
  1117. transformers/models/qwen3/modular_qwen3.py +4 -6
  1118. transformers/models/qwen3_moe/configuration_qwen3_moe.py +25 -28
  1119. transformers/models/qwen3_moe/modeling_qwen3_moe.py +36 -39
  1120. transformers/models/qwen3_moe/modular_qwen3_moe.py +10 -13
  1121. transformers/models/qwen3_next/configuration_qwen3_next.py +31 -34
  1122. transformers/models/qwen3_next/modeling_qwen3_next.py +39 -42
  1123. transformers/models/qwen3_next/modular_qwen3_next.py +33 -34
  1124. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +85 -88
  1125. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +107 -110
  1126. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +122 -148
  1127. transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +40 -48
  1128. transformers/models/qwen3_vl/configuration_qwen3_vl.py +16 -19
  1129. transformers/models/qwen3_vl/modeling_qwen3_vl.py +74 -77
  1130. transformers/models/qwen3_vl/modular_qwen3_vl.py +68 -105
  1131. transformers/models/qwen3_vl/processing_qwen3_vl.py +6 -42
  1132. transformers/models/qwen3_vl/video_processing_qwen3_vl.py +10 -12
  1133. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +21 -25
  1134. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +80 -83
  1135. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +33 -36
  1136. transformers/models/rag/configuration_rag.py +0 -1
  1137. transformers/models/rag/modeling_rag.py +116 -118
  1138. transformers/models/rag/retrieval_rag.py +2 -4
  1139. transformers/models/rag/tokenization_rag.py +0 -50
  1140. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +21 -24
  1141. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +31 -34
  1142. transformers/models/reformer/configuration_reformer.py +0 -1
  1143. transformers/models/reformer/modeling_reformer.py +67 -68
  1144. transformers/models/reformer/tokenization_reformer.py +3 -6
  1145. transformers/models/regnet/configuration_regnet.py +0 -1
  1146. transformers/models/regnet/modeling_regnet.py +7 -9
  1147. transformers/models/rembert/configuration_rembert.py +0 -1
  1148. transformers/models/rembert/modeling_rembert.py +108 -110
  1149. transformers/models/rembert/tokenization_rembert.py +1 -4
  1150. transformers/models/resnet/configuration_resnet.py +0 -1
  1151. transformers/models/resnet/modeling_resnet.py +8 -10
  1152. transformers/models/roberta/configuration_roberta.py +0 -1
  1153. transformers/models/roberta/modeling_roberta.py +91 -93
  1154. transformers/models/roberta/modular_roberta.py +55 -58
  1155. transformers/models/roberta/tokenization_roberta.py +2 -5
  1156. transformers/models/roberta/tokenization_roberta_old.py +2 -4
  1157. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +0 -1
  1158. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +91 -93
  1159. transformers/models/roc_bert/configuration_roc_bert.py +0 -1
  1160. transformers/models/roc_bert/modeling_roc_bert.py +119 -121
  1161. transformers/models/roc_bert/tokenization_roc_bert.py +88 -94
  1162. transformers/models/roformer/configuration_roformer.py +0 -1
  1163. transformers/models/roformer/modeling_roformer.py +79 -81
  1164. transformers/models/roformer/tokenization_roformer.py +3 -6
  1165. transformers/models/roformer/tokenization_utils.py +0 -1
  1166. transformers/models/rt_detr/configuration_rt_detr.py +0 -1
  1167. transformers/models/rt_detr/configuration_rt_detr_resnet.py +0 -1
  1168. transformers/models/rt_detr/image_processing_rt_detr.py +54 -55
  1169. transformers/models/rt_detr/image_processing_rt_detr_fast.py +15 -15
  1170. transformers/models/rt_detr/modeling_rt_detr.py +80 -82
  1171. transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -4
  1172. transformers/models/rt_detr/modular_rt_detr.py +14 -14
  1173. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +0 -1
  1174. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +79 -81
  1175. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +2 -4
  1176. transformers/models/rwkv/configuration_rwkv.py +0 -1
  1177. transformers/models/rwkv/modeling_rwkv.py +29 -31
  1178. transformers/models/sam/configuration_sam.py +0 -1
  1179. transformers/models/sam/image_processing_sam.py +59 -60
  1180. transformers/models/sam/image_processing_sam_fast.py +21 -22
  1181. transformers/models/sam/modeling_sam.py +33 -35
  1182. transformers/models/sam/processing_sam.py +39 -27
  1183. transformers/models/sam2/configuration_sam2.py +0 -1
  1184. transformers/models/sam2/image_processing_sam2_fast.py +14 -15
  1185. transformers/models/sam2/modeling_sam2.py +45 -47
  1186. transformers/models/sam2/modular_sam2.py +43 -44
  1187. transformers/models/sam2/processing_sam2.py +31 -47
  1188. transformers/models/sam2_video/configuration_sam2_video.py +0 -1
  1189. transformers/models/sam2_video/modeling_sam2_video.py +69 -70
  1190. transformers/models/sam2_video/modular_sam2_video.py +60 -79
  1191. transformers/models/sam2_video/processing_sam2_video.py +49 -66
  1192. transformers/models/sam2_video/video_processing_sam2_video.py +1 -4
  1193. transformers/models/sam3/configuration_sam3.py +0 -1
  1194. transformers/models/sam3/image_processing_sam3_fast.py +17 -20
  1195. transformers/models/sam3/modeling_sam3.py +54 -56
  1196. transformers/models/sam3/modular_sam3.py +3 -8
  1197. transformers/models/sam3/processing_sam3.py +29 -48
  1198. transformers/models/sam3_tracker/__init__.py +0 -1
  1199. transformers/models/sam3_tracker/configuration_sam3_tracker.py +0 -1
  1200. transformers/models/sam3_tracker/modeling_sam3_tracker.py +34 -36
  1201. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
  1202. transformers/models/sam3_tracker/processing_sam3_tracker.py +31 -47
  1203. transformers/models/sam3_tracker_video/__init__.py +0 -1
  1204. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +0 -1
  1205. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +70 -70
  1206. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +2 -4
  1207. transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +50 -66
  1208. transformers/models/sam3_video/configuration_sam3_video.py +0 -1
  1209. transformers/models/sam3_video/modeling_sam3_video.py +29 -31
  1210. transformers/models/sam3_video/processing_sam3_video.py +25 -45
  1211. transformers/models/sam_hq/__init__.py +1 -1
  1212. transformers/models/sam_hq/configuration_sam_hq.py +0 -1
  1213. transformers/models/sam_hq/modeling_sam_hq.py +39 -41
  1214. transformers/models/sam_hq/modular_sam_hq.py +17 -19
  1215. transformers/models/sam_hq/{processing_samhq.py → processing_sam_hq.py} +39 -28
  1216. transformers/models/seamless_m4t/configuration_seamless_m4t.py +0 -1
  1217. transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py +8 -11
  1218. transformers/models/seamless_m4t/modeling_seamless_m4t.py +180 -182
  1219. transformers/models/seamless_m4t/processing_seamless_m4t.py +18 -39
  1220. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +15 -20
  1221. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +0 -1
  1222. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +193 -195
  1223. transformers/models/seed_oss/configuration_seed_oss.py +23 -25
  1224. transformers/models/seed_oss/modeling_seed_oss.py +30 -32
  1225. transformers/models/seed_oss/modular_seed_oss.py +3 -4
  1226. transformers/models/segformer/configuration_segformer.py +0 -10
  1227. transformers/models/segformer/image_processing_segformer.py +39 -42
  1228. transformers/models/segformer/image_processing_segformer_fast.py +7 -8
  1229. transformers/models/segformer/modeling_segformer.py +24 -26
  1230. transformers/models/segformer/modular_segformer.py +5 -6
  1231. transformers/models/seggpt/configuration_seggpt.py +0 -1
  1232. transformers/models/seggpt/image_processing_seggpt.py +38 -41
  1233. transformers/models/seggpt/modeling_seggpt.py +28 -30
  1234. transformers/models/sew/configuration_sew.py +0 -1
  1235. transformers/models/sew/modeling_sew.py +33 -35
  1236. transformers/models/sew/modular_sew.py +10 -12
  1237. transformers/models/sew_d/configuration_sew_d.py +0 -1
  1238. transformers/models/sew_d/modeling_sew_d.py +28 -30
  1239. transformers/models/shieldgemma2/configuration_shieldgemma2.py +0 -1
  1240. transformers/models/shieldgemma2/modeling_shieldgemma2.py +15 -17
  1241. transformers/models/shieldgemma2/processing_shieldgemma2.py +3 -5
  1242. transformers/models/siglip/configuration_siglip.py +0 -1
  1243. transformers/models/siglip/image_processing_siglip.py +17 -20
  1244. transformers/models/siglip/image_processing_siglip_fast.py +0 -1
  1245. transformers/models/siglip/modeling_siglip.py +38 -39
  1246. transformers/models/siglip/processing_siglip.py +2 -14
  1247. transformers/models/siglip/tokenization_siglip.py +6 -7
  1248. transformers/models/siglip2/configuration_siglip2.py +1 -1
  1249. transformers/models/siglip2/image_processing_siglip2.py +15 -16
  1250. transformers/models/siglip2/image_processing_siglip2_fast.py +4 -5
  1251. transformers/models/siglip2/modeling_siglip2.py +54 -54
  1252. transformers/models/siglip2/modular_siglip2.py +23 -25
  1253. transformers/models/siglip2/processing_siglip2.py +2 -14
  1254. transformers/models/smollm3/configuration_smollm3.py +23 -26
  1255. transformers/models/smollm3/modeling_smollm3.py +31 -34
  1256. transformers/models/smollm3/modular_smollm3.py +27 -29
  1257. transformers/models/smolvlm/configuration_smolvlm.py +1 -1
  1258. transformers/models/smolvlm/image_processing_smolvlm.py +42 -43
  1259. transformers/models/smolvlm/image_processing_smolvlm_fast.py +12 -12
  1260. transformers/models/smolvlm/modeling_smolvlm.py +51 -52
  1261. transformers/models/smolvlm/modular_smolvlm.py +15 -17
  1262. transformers/models/smolvlm/processing_smolvlm.py +15 -76
  1263. transformers/models/smolvlm/video_processing_smolvlm.py +7 -8
  1264. transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +0 -1
  1265. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +20 -23
  1266. transformers/models/speech_to_text/configuration_speech_to_text.py +0 -1
  1267. transformers/models/speech_to_text/feature_extraction_speech_to_text.py +10 -13
  1268. transformers/models/speech_to_text/modeling_speech_to_text.py +52 -54
  1269. transformers/models/speech_to_text/processing_speech_to_text.py +4 -30
  1270. transformers/models/speech_to_text/tokenization_speech_to_text.py +5 -6
  1271. transformers/models/speecht5/configuration_speecht5.py +0 -1
  1272. transformers/models/speecht5/feature_extraction_speecht5.py +16 -37
  1273. transformers/models/speecht5/modeling_speecht5.py +172 -174
  1274. transformers/models/speecht5/number_normalizer.py +0 -1
  1275. transformers/models/speecht5/processing_speecht5.py +3 -37
  1276. transformers/models/speecht5/tokenization_speecht5.py +4 -5
  1277. transformers/models/splinter/configuration_splinter.py +0 -1
  1278. transformers/models/splinter/modeling_splinter.py +54 -56
  1279. transformers/models/splinter/tokenization_splinter.py +2 -4
  1280. transformers/models/squeezebert/configuration_squeezebert.py +0 -1
  1281. transformers/models/squeezebert/modeling_squeezebert.py +60 -62
  1282. transformers/models/squeezebert/tokenization_squeezebert.py +0 -1
  1283. transformers/models/stablelm/configuration_stablelm.py +20 -23
  1284. transformers/models/stablelm/modeling_stablelm.py +39 -42
  1285. transformers/models/starcoder2/configuration_starcoder2.py +19 -22
  1286. transformers/models/starcoder2/modeling_starcoder2.py +33 -36
  1287. transformers/models/starcoder2/modular_starcoder2.py +13 -15
  1288. transformers/models/superglue/configuration_superglue.py +3 -3
  1289. transformers/models/superglue/image_processing_superglue.py +15 -15
  1290. transformers/models/superglue/image_processing_superglue_fast.py +4 -5
  1291. transformers/models/superglue/modeling_superglue.py +32 -33
  1292. transformers/models/superpoint/image_processing_superpoint.py +15 -15
  1293. transformers/models/superpoint/image_processing_superpoint_fast.py +4 -5
  1294. transformers/models/superpoint/modeling_superpoint.py +13 -14
  1295. transformers/models/swiftformer/configuration_swiftformer.py +0 -1
  1296. transformers/models/swiftformer/modeling_swiftformer.py +12 -14
  1297. transformers/models/swin/configuration_swin.py +0 -1
  1298. transformers/models/swin/modeling_swin.py +58 -70
  1299. transformers/models/swin2sr/configuration_swin2sr.py +0 -1
  1300. transformers/models/swin2sr/image_processing_swin2sr.py +10 -13
  1301. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -5
  1302. transformers/models/swin2sr/modeling_swin2sr.py +26 -28
  1303. transformers/models/swinv2/configuration_swinv2.py +0 -1
  1304. transformers/models/swinv2/modeling_swinv2.py +55 -67
  1305. transformers/models/switch_transformers/configuration_switch_transformers.py +0 -1
  1306. transformers/models/switch_transformers/modeling_switch_transformers.py +32 -33
  1307. transformers/models/switch_transformers/modular_switch_transformers.py +29 -30
  1308. transformers/models/t5/configuration_t5.py +0 -1
  1309. transformers/models/t5/modeling_t5.py +75 -77
  1310. transformers/models/t5/tokenization_t5.py +1 -3
  1311. transformers/models/t5gemma/configuration_t5gemma.py +33 -34
  1312. transformers/models/t5gemma/modeling_t5gemma.py +96 -99
  1313. transformers/models/t5gemma/modular_t5gemma.py +117 -118
  1314. transformers/models/t5gemma2/configuration_t5gemma2.py +53 -54
  1315. transformers/models/t5gemma2/modeling_t5gemma2.py +96 -99
  1316. transformers/models/t5gemma2/modular_t5gemma2.py +134 -135
  1317. transformers/models/table_transformer/configuration_table_transformer.py +0 -1
  1318. transformers/models/table_transformer/modeling_table_transformer.py +46 -48
  1319. transformers/models/tapas/configuration_tapas.py +0 -1
  1320. transformers/models/tapas/modeling_tapas.py +64 -66
  1321. transformers/models/tapas/tokenization_tapas.py +115 -153
  1322. transformers/models/textnet/configuration_textnet.py +0 -1
  1323. transformers/models/textnet/image_processing_textnet.py +22 -25
  1324. transformers/models/textnet/image_processing_textnet_fast.py +5 -6
  1325. transformers/models/textnet/modeling_textnet.py +13 -14
  1326. transformers/models/time_series_transformer/configuration_time_series_transformer.py +5 -8
  1327. transformers/models/time_series_transformer/modeling_time_series_transformer.py +79 -81
  1328. transformers/models/timesfm/configuration_timesfm.py +0 -1
  1329. transformers/models/timesfm/modeling_timesfm.py +17 -19
  1330. transformers/models/timesfm/modular_timesfm.py +16 -18
  1331. transformers/models/timesformer/configuration_timesformer.py +0 -1
  1332. transformers/models/timesformer/modeling_timesformer.py +13 -16
  1333. transformers/models/timm_backbone/configuration_timm_backbone.py +0 -1
  1334. transformers/models/timm_backbone/modeling_timm_backbone.py +4 -6
  1335. transformers/models/timm_wrapper/configuration_timm_wrapper.py +2 -3
  1336. transformers/models/timm_wrapper/image_processing_timm_wrapper.py +4 -5
  1337. transformers/models/timm_wrapper/modeling_timm_wrapper.py +13 -15
  1338. transformers/models/trocr/configuration_trocr.py +0 -1
  1339. transformers/models/trocr/modeling_trocr.py +38 -40
  1340. transformers/models/trocr/processing_trocr.py +5 -25
  1341. transformers/models/tvp/configuration_tvp.py +0 -1
  1342. transformers/models/tvp/image_processing_tvp.py +50 -52
  1343. transformers/models/tvp/image_processing_tvp_fast.py +9 -10
  1344. transformers/models/tvp/modeling_tvp.py +25 -27
  1345. transformers/models/tvp/processing_tvp.py +2 -14
  1346. transformers/models/udop/configuration_udop.py +0 -1
  1347. transformers/models/udop/modeling_udop.py +63 -66
  1348. transformers/models/udop/processing_udop.py +7 -26
  1349. transformers/models/udop/tokenization_udop.py +80 -93
  1350. transformers/models/umt5/configuration_umt5.py +0 -1
  1351. transformers/models/umt5/modeling_umt5.py +80 -81
  1352. transformers/models/unispeech/configuration_unispeech.py +0 -1
  1353. transformers/models/unispeech/modeling_unispeech.py +47 -49
  1354. transformers/models/unispeech/modular_unispeech.py +20 -22
  1355. transformers/models/unispeech_sat/configuration_unispeech_sat.py +0 -1
  1356. transformers/models/unispeech_sat/modeling_unispeech_sat.py +63 -65
  1357. transformers/models/unispeech_sat/modular_unispeech_sat.py +21 -23
  1358. transformers/models/univnet/feature_extraction_univnet.py +14 -14
  1359. transformers/models/univnet/modeling_univnet.py +7 -8
  1360. transformers/models/upernet/configuration_upernet.py +0 -1
  1361. transformers/models/upernet/modeling_upernet.py +10 -13
  1362. transformers/models/vaultgemma/__init__.py +0 -1
  1363. transformers/models/vaultgemma/configuration_vaultgemma.py +24 -26
  1364. transformers/models/vaultgemma/modeling_vaultgemma.py +34 -36
  1365. transformers/models/vaultgemma/modular_vaultgemma.py +29 -31
  1366. transformers/models/video_llama_3/image_processing_video_llama_3.py +40 -40
  1367. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +8 -8
  1368. transformers/models/video_llama_3/modeling_video_llama_3.py +66 -66
  1369. transformers/models/video_llama_3/modular_video_llama_3.py +101 -112
  1370. transformers/models/video_llama_3/processing_video_llama_3.py +5 -39
  1371. transformers/models/video_llama_3/video_processing_video_llama_3.py +18 -18
  1372. transformers/models/video_llava/configuration_video_llava.py +0 -1
  1373. transformers/models/video_llava/image_processing_video_llava.py +35 -38
  1374. transformers/models/video_llava/modeling_video_llava.py +52 -54
  1375. transformers/models/video_llava/processing_video_llava.py +38 -78
  1376. transformers/models/video_llava/video_processing_video_llava.py +0 -1
  1377. transformers/models/videomae/configuration_videomae.py +0 -1
  1378. transformers/models/videomae/image_processing_videomae.py +31 -34
  1379. transformers/models/videomae/modeling_videomae.py +13 -15
  1380. transformers/models/videomae/video_processing_videomae.py +0 -1
  1381. transformers/models/vilt/configuration_vilt.py +0 -1
  1382. transformers/models/vilt/image_processing_vilt.py +29 -30
  1383. transformers/models/vilt/image_processing_vilt_fast.py +9 -10
  1384. transformers/models/vilt/modeling_vilt.py +76 -78
  1385. transformers/models/vilt/processing_vilt.py +2 -14
  1386. transformers/models/vipllava/configuration_vipllava.py +0 -1
  1387. transformers/models/vipllava/modeling_vipllava.py +38 -39
  1388. transformers/models/vipllava/modular_vipllava.py +30 -32
  1389. transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +0 -1
  1390. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +18 -21
  1391. transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +0 -1
  1392. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +18 -21
  1393. transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +2 -16
  1394. transformers/models/visual_bert/configuration_visual_bert.py +0 -1
  1395. transformers/models/visual_bert/modeling_visual_bert.py +90 -92
  1396. transformers/models/vit/configuration_vit.py +0 -1
  1397. transformers/models/vit/image_processing_vit.py +19 -22
  1398. transformers/models/vit/image_processing_vit_fast.py +0 -1
  1399. transformers/models/vit/modeling_vit.py +13 -15
  1400. transformers/models/vit_mae/configuration_vit_mae.py +0 -1
  1401. transformers/models/vit_mae/modeling_vit_mae.py +21 -23
  1402. transformers/models/vit_msn/configuration_vit_msn.py +0 -1
  1403. transformers/models/vit_msn/modeling_vit_msn.py +10 -12
  1404. transformers/models/vitdet/configuration_vitdet.py +0 -1
  1405. transformers/models/vitdet/modeling_vitdet.py +12 -14
  1406. transformers/models/vitmatte/configuration_vitmatte.py +1 -4
  1407. transformers/models/vitmatte/image_processing_vitmatte.py +15 -18
  1408. transformers/models/vitmatte/image_processing_vitmatte_fast.py +14 -15
  1409. transformers/models/vitmatte/modeling_vitmatte.py +9 -11
  1410. transformers/models/vitpose/configuration_vitpose.py +3 -6
  1411. transformers/models/vitpose/image_processing_vitpose.py +24 -25
  1412. transformers/models/vitpose/image_processing_vitpose_fast.py +9 -10
  1413. transformers/models/vitpose/modeling_vitpose.py +10 -12
  1414. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +0 -1
  1415. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +8 -10
  1416. transformers/models/vits/configuration_vits.py +0 -1
  1417. transformers/models/vits/modeling_vits.py +34 -35
  1418. transformers/models/vits/tokenization_vits.py +3 -4
  1419. transformers/models/vivit/configuration_vivit.py +0 -1
  1420. transformers/models/vivit/image_processing_vivit.py +36 -39
  1421. transformers/models/vivit/modeling_vivit.py +5 -7
  1422. transformers/models/vjepa2/__init__.py +0 -1
  1423. transformers/models/vjepa2/configuration_vjepa2.py +0 -1
  1424. transformers/models/vjepa2/modeling_vjepa2.py +30 -32
  1425. transformers/models/vjepa2/video_processing_vjepa2.py +0 -1
  1426. transformers/models/voxtral/__init__.py +0 -1
  1427. transformers/models/voxtral/configuration_voxtral.py +0 -1
  1428. transformers/models/voxtral/modeling_voxtral.py +17 -25
  1429. transformers/models/voxtral/modular_voxtral.py +10 -19
  1430. transformers/models/voxtral/processing_voxtral.py +25 -48
  1431. transformers/models/wav2vec2/configuration_wav2vec2.py +0 -1
  1432. transformers/models/wav2vec2/feature_extraction_wav2vec2.py +7 -10
  1433. transformers/models/wav2vec2/modeling_wav2vec2.py +67 -122
  1434. transformers/models/wav2vec2/processing_wav2vec2.py +6 -35
  1435. transformers/models/wav2vec2/tokenization_wav2vec2.py +20 -332
  1436. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +0 -1
  1437. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +49 -52
  1438. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +45 -48
  1439. transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +6 -35
  1440. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +0 -1
  1441. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +62 -65
  1442. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +15 -18
  1443. transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +16 -17
  1444. transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +36 -55
  1445. transformers/models/wavlm/configuration_wavlm.py +0 -1
  1446. transformers/models/wavlm/modeling_wavlm.py +45 -48
  1447. transformers/models/wavlm/modular_wavlm.py +4 -5
  1448. transformers/models/whisper/configuration_whisper.py +0 -1
  1449. transformers/models/whisper/english_normalizer.py +3 -4
  1450. transformers/models/whisper/feature_extraction_whisper.py +9 -24
  1451. transformers/models/whisper/generation_whisper.py +26 -48
  1452. transformers/models/whisper/modeling_whisper.py +68 -70
  1453. transformers/models/whisper/processing_whisper.py +3 -20
  1454. transformers/models/whisper/tokenization_whisper.py +9 -30
  1455. transformers/models/x_clip/configuration_x_clip.py +0 -1
  1456. transformers/models/x_clip/modeling_x_clip.py +68 -69
  1457. transformers/models/x_clip/processing_x_clip.py +2 -14
  1458. transformers/models/xcodec/configuration_xcodec.py +4 -6
  1459. transformers/models/xcodec/modeling_xcodec.py +15 -17
  1460. transformers/models/xglm/configuration_xglm.py +0 -1
  1461. transformers/models/xglm/modeling_xglm.py +49 -55
  1462. transformers/models/xglm/tokenization_xglm.py +1 -4
  1463. transformers/models/xlm/configuration_xlm.py +0 -1
  1464. transformers/models/xlm/modeling_xlm.py +126 -130
  1465. transformers/models/xlm/tokenization_xlm.py +3 -5
  1466. transformers/models/xlm_roberta/configuration_xlm_roberta.py +0 -1
  1467. transformers/models/xlm_roberta/modeling_xlm_roberta.py +90 -92
  1468. transformers/models/xlm_roberta/modular_xlm_roberta.py +50 -53
  1469. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +1 -4
  1470. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +0 -1
  1471. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +91 -93
  1472. transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +67 -70
  1473. transformers/models/xlnet/configuration_xlnet.py +0 -11
  1474. transformers/models/xlnet/modeling_xlnet.py +149 -162
  1475. transformers/models/xlnet/tokenization_xlnet.py +1 -4
  1476. transformers/models/xlstm/configuration_xlstm.py +3 -5
  1477. transformers/models/xlstm/modeling_xlstm.py +62 -65
  1478. transformers/models/xmod/configuration_xmod.py +0 -1
  1479. transformers/models/xmod/modeling_xmod.py +98 -100
  1480. transformers/models/yolos/configuration_yolos.py +0 -1
  1481. transformers/models/yolos/image_processing_yolos.py +60 -62
  1482. transformers/models/yolos/image_processing_yolos_fast.py +18 -18
  1483. transformers/models/yolos/modeling_yolos.py +12 -14
  1484. transformers/models/yolos/modular_yolos.py +2 -4
  1485. transformers/models/yoso/configuration_yoso.py +0 -1
  1486. transformers/models/yoso/modeling_yoso.py +60 -62
  1487. transformers/models/zamba/configuration_zamba.py +0 -1
  1488. transformers/models/zamba/modeling_zamba.py +68 -69
  1489. transformers/models/zamba2/configuration_zamba2.py +36 -37
  1490. transformers/models/zamba2/modeling_zamba2.py +84 -87
  1491. transformers/models/zamba2/modular_zamba2.py +43 -45
  1492. transformers/models/zoedepth/configuration_zoedepth.py +0 -1
  1493. transformers/models/zoedepth/image_processing_zoedepth.py +28 -29
  1494. transformers/models/zoedepth/image_processing_zoedepth_fast.py +11 -12
  1495. transformers/models/zoedepth/modeling_zoedepth.py +14 -16
  1496. transformers/pipelines/__init__.py +50 -49
  1497. transformers/pipelines/any_to_any.py +14 -22
  1498. transformers/pipelines/audio_utils.py +1 -2
  1499. transformers/pipelines/base.py +12 -16
  1500. transformers/pipelines/deprecated/__init__.py +0 -1
  1501. transformers/pipelines/image_text_to_text.py +0 -1
  1502. transformers/pipelines/image_to_text.py +4 -44
  1503. transformers/pipelines/question_answering.py +4 -43
  1504. transformers/pipelines/text_classification.py +1 -14
  1505. transformers/pipelines/token_classification.py +1 -22
  1506. transformers/pipelines/video_classification.py +1 -9
  1507. transformers/pipelines/zero_shot_audio_classification.py +0 -1
  1508. transformers/pipelines/zero_shot_classification.py +0 -6
  1509. transformers/pipelines/zero_shot_image_classification.py +0 -7
  1510. transformers/processing_utils.py +95 -95
  1511. transformers/quantizers/base.py +10 -0
  1512. transformers/quantizers/quantizer_quark.py +0 -1
  1513. transformers/quantizers/quantizer_torchao.py +3 -3
  1514. transformers/testing_utils.py +3 -37
  1515. transformers/tokenization_mistral_common.py +554 -903
  1516. transformers/tokenization_utils_base.py +109 -122
  1517. transformers/tokenization_utils_sentencepiece.py +5 -6
  1518. transformers/tokenization_utils_tokenizers.py +5 -5
  1519. transformers/trainer.py +6 -9
  1520. transformers/trainer_jit_checkpoint.py +1 -2
  1521. transformers/training_args.py +3 -3
  1522. transformers/utils/attention_visualizer.py +1 -1
  1523. transformers/utils/auto_docstring.py +564 -12
  1524. transformers/utils/doc.py +1 -1
  1525. transformers/utils/dummy_pt_objects.py +0 -42
  1526. transformers/utils/generic.py +1 -1
  1527. transformers/utils/loading_report.py +3 -3
  1528. transformers/utils/quantization_config.py +8 -10
  1529. transformers/video_processing_utils.py +19 -20
  1530. transformers/video_utils.py +18 -22
  1531. {transformers-5.0.0rc2.dist-info → transformers-5.0.0rc3.dist-info}/METADATA +19 -19
  1532. transformers-5.0.0rc3.dist-info/RECORD +2067 -0
  1533. transformers-5.0.0rc2.dist-info/RECORD +0 -2042
  1534. {transformers-5.0.0rc2.dist-info → transformers-5.0.0rc3.dist-info}/WHEEL +0 -0
  1535. {transformers-5.0.0rc2.dist-info → transformers-5.0.0rc3.dist-info}/entry_points.txt +0 -0
  1536. {transformers-5.0.0rc2.dist-info → transformers-5.0.0rc3.dist-info}/licenses/LICENSE +0 -0
  1537. {transformers-5.0.0rc2.dist-info → transformers-5.0.0rc3.dist-info}/top_level.txt +0 -0
@@ -14,39 +14,39 @@
  import os
  import re
  import shutil
- import warnings
- from collections.abc import Callable, Mapping, Sized
+ from collections.abc import Callable, Sequence
  from enum import Enum
  from pathlib import Path
- from typing import Any, Union, overload
+ from typing import Any, Literal, Union, overload

  import numpy as np
  from huggingface_hub import create_repo

  from transformers.audio_utils import load_audio_as
  from transformers.tokenization_utils_base import (
-     LARGE_INTEGER,
      VERY_LARGE_INTEGER,
+     AddedToken,
      BatchEncoding,
      EncodedInput,
      PreTokenizedInput,
+     PreTrainedTokenizerBase,
      TextInput,
      TruncationStrategy,
  )
  from transformers.utils import PaddingStrategy, TensorType, add_end_docstrings, logging, to_py_obj
- from transformers.utils.generic import is_torch_tensor
- from transformers.utils.hub import PushToHubMixin
  from transformers.utils.import_utils import is_mistral_common_available, is_torch_available, requires


  if is_mistral_common_available():
      from mistral_common.protocol.instruct.request import ChatCompletionRequest
      from mistral_common.protocol.instruct.validator import ValidationMode
-     from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy, TokenizerVersion
-     from mistral_common.tokens.tokenizers.image import MultiModalVersion
+     from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy, SpecialTokens
      from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
      from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-     from mistral_common.tokens.tokenizers.utils import download_tokenizer_from_hf_hub
+     from mistral_common.tokens.tokenizers.utils import (
+         download_tokenizer_from_hf_hub,
+         get_one_valid_tokenizer_file,
+     )


  if is_torch_available():
@@ -103,6 +103,10 @@ ENCODE_KWARGS_DOCSTRING = r"""
  """

  ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
+     return_token_type_ids (`bool`, *optional*):
+         Whether to return token type IDs. For `MistralCommonBackend` it returns a list of zeros of the sequence length as only one sequence is supported.
+
+         [What are token type IDs?](../glossary#token-type-ids)
      return_attention_mask (`bool`, *optional*):
          Whether to return the attention mask. If left to the default, will return the attention mask according
          to the specific tokenizer's default, defined by the `return_outputs` attribute.
@@ -118,6 +122,8 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
          Whether or not to return the lengths of the encoded inputs.
      verbose (`bool`, *optional*, defaults to `True`):
          Whether or not to print more information and warnings.
+     return_offsets_mapping (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
+     split_special_tokens (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
      **kwargs: passed to the `self.tokenize()` method

  Return:
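
The docstring entries added in the two hunks above document behavior rather than new functionality: this backend handles a single sequence only, so token type IDs, when requested, are simply zeros over the encoded length, and `return_offsets_mapping`/`split_special_tokens` are accepted only as `False` to keep the Transformers signature. A minimal sketch of that contract (the helper name below is invented for illustration and is not part of transformers):

    def sketch_token_type_ids(input_ids: list[int]) -> list[int]:
        # Only one segment is ever present, so every position gets segment id 0.
        return [0] * len(input_ids)

    print(sketch_token_type_ids([1, 5, 9, 2]))  # [0, 0, 0, 0]
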
@@ -149,8 +155,35 @@ class MistralTokenizerType(str, Enum):
      tekken = "tekken"


+ @overload
+ def _maybe_remove_lang(text: str, skip_special_tokens: bool) -> str: ...
+ @overload
+ def _maybe_remove_lang(text: list[str], skip_special_tokens: bool) -> list[str]: ...
+ def _maybe_remove_lang(text: str | list[str], skip_special_tokens: bool) -> str | list[str]:
+     # in the specific case of Voxtral, the added f"lang:xx" (always a two char language code since it follows ISO 639-1 alpha-2 format)
+     # is not considered as a special token by mistral-common and is encoded/ decoded as normal text.
+     # Nevertheless we should remove it to ease users life.
+     if not skip_special_tokens:
+         return text
+
+     if isinstance(text, str):
+         return re.sub(r"^lang:[a-z]{2}", "", text)
+
+     return [re.sub(r"^lang:[a-z]{2}", "", string) for string in text]
+
+
+ _MAP_SPECIAL_TOKENS = {
+     "bos_token": SpecialTokens.bos.value,
+     "eos_token": SpecialTokens.eos.value,
+     "pad_token": SpecialTokens.pad.value,
+     "unk_token": SpecialTokens.unk.value,
+ }
+
+ _VALID_INIT_KWARGS = {"_from_auto", "backend", "files_loaded"}
+
+
  @requires(backends=("mistral-common",))
- class MistralCommonBackend(PushToHubMixin):
+ class MistralCommonBackend(PreTrainedTokenizerBase):
      """
      Class to wrap `mistral-common` tokenizers.

@@ -165,34 +198,13 @@ class MistralCommonBackend(PushToHubMixin):
      For more info on `mistral-common`, see [mistral-common](https://github.com/mistralai/mistral-common).

      This class is a wrapper around a `mistral_common.tokens.tokenizers.mistral.MistralTokenizer`.
-     It provides a Hugging Face compatible interface to tokenize using the official mistral-common tokenizer.
-
-     Supports the following methods from the `PreTrainedTokenizerBase` class:
-
-     - [`~MistralCommonBackend.get_vocab`]: Returns the vocabulary as a dictionary of token to index.
-       This is a lossy conversion for Tekkenizer as some decoding errors are collapsed into the same token.
-     - [`~MistralCommonBackend.encode`]: Encode a string to a list of integers.
-     - [`~MistralCommonBackend.decode`]: Decode a list of integers to a string.
-     - [`~MistralCommonBackend.batch_decode`]: Decode a batch of list of integers to a list of strings.
-     - [`~MistralCommonBackend.convert_tokens_to_ids`]: Convert a list of tokens to a list of integers.
-     - [`~MistralCommonBackend.convert_ids_to_tokens`]: Convert a list of integers to a list of tokens.
-     - [`~MistralCommonBackend.tokenize`]: Tokenize a string.
-     - [`~MistralCommonBackend.get_special_tokens_mask`]: Get the special tokens mask for a list of tokens.
-     - [`~MistralCommonBackend.prepare_for_model`]: Prepare a list of inputs for the model.
-     - [`~MistralCommonBackend.pad`]: Pad a list of inputs to the same length.
-     - [`~MistralCommonBackend.truncate_sequences`]: Truncate a list of sequences to the same length.
-     - [`~MistralCommonBackend.apply_chat_template`]: Apply a chat template to a list of messages.
-     - [`~MistralCommonBackend.__call__`]: Tokenize a string or a list of strings.
-     - [`~MistralCommonBackend.from_pretrained`]: Download and cache a pretrained tokenizer from the Hugging Face model hub or local directory.
-     - [`~MistralCommonBackend.save_pretrained`]: Save a tokenizer to a directory, so it can be reloaded using the `from_pretrained` class method.
-     - [`~MistralCommonBackend.push_to_hub`]: Upload tokenizer to the Hugging Face model hub.
-
-     Here are the key differences with the `PreTrainedTokenizerBase` class:
-
-     - Pair of sequences are not supported. The signature have been kept for compatibility but all arguments related to pair of sequences are ignored. The return values of pairs are returned as `None`.
+     It provides a Hugging Face compatible interface to tokenize using the official mistral-common tokenizer and inherits from the `PreTrainedTokenizerBase` class.
+
+     Here are the key behavior differences with the `PythonBackend` class:
+
+     - Pair of sequences are not supported. The signature has been kept for compatibility but all arguments related to pair of sequences are ignored. The return values for pairs are returned as `None`.
      - The `is_split_into_words` argument is not supported.
-     - The `return_token_type_ids` argument is not supported.
-     - It is not possible to add new tokens to the tokenizer. Also the special tokens are handled differently from Transformers. In `mistral-common`, special tokens are never encoded directly. This means that: `tokenizer.encode("<s>")` will not return the ID of the `<s>` token. Instead, it will return a list of IDs corresponding to the tokenization of the string `"<s>"`. For more information, see the [mistral-common documentation](https://mistralai.github.io/mistral-common/usage/tokenizers/#special-tokens).
+     - It is not possible to add new tokens to the tokenizer. Special tokens are handled differently from Transformers. In `mistral-common`, special tokens are never encoded directly. This means that: `tokenizer.encode("<s>")` will not return the ID of the `<s>` token. Instead, it will return a list of IDs corresponding to the tokenization of the string `"<s>"`. For more information, see the [mistral-common documentation](https://mistralai.github.io/mistral-common/usage/tokenizers/#special-tokens).

      If you have suggestions to improve this class, please open an issue on the [mistral-common GitHub repository](https://github.com/mistralai/mistral-common/issues) if it is related to the tokenizer or on the [Transformers GitHub repository](https://github.com/huggingface/transformers/issues) if it is related to the Hugging Face interface.
      """
@@ -200,6 +212,12 @@ class MistralCommonBackend(PushToHubMixin):
      model_input_names: list[str] = ["input_ids", "attention_mask"]
      padding_side: str = "left"
      truncation_side: str = "right"
+     SPECIAL_TOKENS_ATTRIBUTES = [
+         "bos_token",
+         "eos_token",
+         "unk_token",
+         "pad_token",
+     ]

      def __init__(
          self,
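
Two behavioral points from the hunks above are easy to miss: the class docstring states that mistral-common never encodes special tokens from raw text, and the new `SPECIAL_TOKENS_ATTRIBUTES` narrows the exposed special tokens to bos/eos/unk/pad. A hedged sketch of what this looks like from the user side; it assumes the rc3 wheel and mistral-common are installed, and the tokenizer path is a placeholder for a file obtained separately (a tekken.json or SentencePiece model):

    from transformers.tokenization_mistral_common import MistralCommonBackend

    # Placeholder path: point this at a locally available mistral-common tokenizer file.
    tok = MistralCommonBackend(tokenizer_path="/path/to/tekken.json")

    # Per the class docstring, this does NOT return the single BOS id: the string
    # "<s>" is tokenized as literal text and comes back as several ids.
    print(tok.encode("<s>"))

    # Only these four special-token attributes are exposed by this backend.
    print(MistralCommonBackend.SPECIAL_TOKENS_ATTRIBUTES)
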
@@ -226,7 +244,7 @@
              Path to the tokenizer file to load the `MistralTokenizer`.
          mode (`Union[str, ValidationMode]`, *optional*, defaults to `ValidationMode.test`):
              The mode to use for the tokenizer. This will be passed to the `MistralTokenizer` constructor. Possible values are:
-                 - `"finetuning"` or `ValidationMode.finetuning`: The finetuning mode.
+                 - `"finetuning"` or `ValidationMode.finetuning`: The fine-tuning mode.
                  - `"test"` or `ValidationMode.test`: The test mode.
              It changes how the tokenizer validates the input and prepares the request to the model.
          model_max_length (`int`, *optional*):
@@ -240,60 +258,40 @@ class MistralCommonBackend(PushToHubMixin):
240
258
  truncation_side (`str`, *optional*):
241
259
  The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
242
260
  Default value is picked from the class attribute of the same name.
243
- model_input_names (`List[string]`, *optional*):
261
+ model_input_names (`List[str]`, *optional*):
244
262
  The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
245
263
  `"attention_mask"`). Default value is picked from the class attribute of the same name.
246
264
  clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
247
- Whether or not the model should cleanup the spaces that were added when splitting the input text during the
265
+ Whether or not the model should clean up the spaces that were added when splitting the input text during the
248
266
  tokenization process.
249
267
  """
250
- if kwargs:
268
+ if kwargs and not set(kwargs.keys()).issubset(_VALID_INIT_KWARGS):
251
269
  raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported to init `MistralCommonBackend`.")
252
270
 
253
271
  self._tokenizer_path = Path(tokenizer_path)
254
272
  self._mode = self._get_validation_mode(mode)
273
+
255
274
  self.tokenizer: MistralTokenizer = MistralTokenizer.from_file(str(self._tokenizer_path), mode=self._mode)
256
275
  self._tokenizer_type = (
257
276
  MistralTokenizerType.tekken
258
277
  if isinstance(self.tokenizer.instruct_tokenizer.tokenizer, Tekkenizer)
259
278
  else MistralTokenizerType.spm
260
279
  )
261
- self.truncation_side = truncation_side
262
- self.padding_side = padding_side
263
- self.model_max_length = model_max_length
264
- self.cleanup_tokenization_spaces = clean_up_tokenization_spaces
265
- self.deprecation_warnings = {} # Use to store when we have already noticed a deprecation warning (avoid overlogging).
266
- self._all_special_tokens_ids = self._get_all_special_ids()
267
-
268
- if model_input_names is not None:
269
- if (
270
- not isinstance(model_input_names, (list, tuple))
271
- and len(model_input_names) == 0
272
- and not all(isinstance(i, str) for i in model_input_names)
273
- ):
274
- raise ValueError(
275
- "`model_input_names` should be a non-empty list or tuple of str but got an empty value."
276
- )
277
- self.model_input_names = model_input_names
278
-
279
280
  self._cache_get_vocab: dict[str, int] | None = None
280
281
 
281
- @staticmethod
282
- def clean_up_tokenization(text: str) -> str:
283
- """
284
- Clean up a list of simple English tokenization artifacts like spaces before punctuation.
285
- """
286
- return (
287
- text.replace(" .", ".")
288
- .replace(" ?", "?")
289
- .replace(" !", "!")
290
- .replace(" ,", ",")
291
- .replace(" ' ", "'")
292
- .replace(" n't", "n't")
293
- .replace(" 'm", "'m")
294
- .replace(" 's", "'s")
295
- .replace(" 've", "'ve")
296
- .replace(" 're", "'re")
282
+ self._all_special_ids = self._get_all_special_ids()
283
+ self._all_special_tokens = self.convert_ids_to_tokens(self.all_special_ids)
284
+
285
+ super().__init__(
286
+ truncation_side=truncation_side,
287
+ padding_side=padding_side,
288
+ model_max_length=model_max_length,
289
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
290
+ extra_special_tokens=None, # Not used by this backend.
291
+ model_specific_special_tokens=None, # Not used by this backend.
292
+ model_input_names=model_input_names or self.model_input_names,
293
+ **_MAP_SPECIAL_TOKENS,
294
+ **kwargs,
297
295
  )
298
296
 
299
297
  @property
@@ -306,75 +304,19 @@ class MistralCommonBackend(PushToHubMixin):
306
304
  """
307
305
  return self._mode
308
306
 
309
- @property
310
- def bos_token_id(self) -> int:
311
- """
312
- Id of the beginning of sentence token in the vocabulary.
313
- """
314
- return self.tokenizer.instruct_tokenizer.tokenizer.bos_id
315
-
316
- @property
317
- def eos_token_id(self) -> int:
318
- """
319
- Id of the end of sentence token in the vocabulary.
320
- """
321
- return self.tokenizer.instruct_tokenizer.tokenizer.eos_id
322
-
323
- @property
324
- def unk_token_id(self) -> int:
325
- """
326
- Id of the unknown token in the vocabulary.
327
- """
328
- return self.tokenizer.instruct_tokenizer.tokenizer.unk_id
329
-
330
- @property
331
- def pad_token_id(self) -> int:
332
- """
333
- Id of the padding token in the vocabulary.
334
- """
335
- return self.tokenizer.instruct_tokenizer.tokenizer.pad_id
336
-
337
- @property
338
- def bos_token(self) -> str:
339
- """
340
- String associated to the beginning of sentence token in the vocabulary.
341
- """
342
- return self.convert_ids_to_tokens(self.bos_token_id)
343
-
344
- @property
345
- def eos_token(self) -> str:
346
- """
347
- String associated to the end of sentence token in the vocabulary.
348
- """
349
- return self.convert_ids_to_tokens(self.eos_token_id)
350
-
351
- @property
352
- def unk_token(self) -> str:
353
- """
354
- String associated to the unknown token in the vocabulary.
355
- """
356
- return self.convert_ids_to_tokens(self.unk_token_id)
357
-
358
- @property
359
- def pad_token(self) -> str:
360
- """
361
- String associated to the padding token in the vocabulary.
362
- """
363
- return self.convert_ids_to_tokens(self.pad_token_id)
364
-
365
307
  @property
366
308
  def all_special_ids(self) -> list[int]:
367
309
  """
368
310
  `list[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.).
369
311
  """
370
- return sorted(self._all_special_tokens_ids)
312
+ return sorted(self._all_special_ids)
371
313
 
372
314
  @property
373
315
  def all_special_tokens(self) -> list[str]:
374
316
  """
375
317
  `list[str]`: A list of all unique special tokens.
376
318
  """
377
- return self.convert_ids_to_tokens(self.all_special_ids)
319
+ return self._all_special_tokens
378
320
 
379
321
  @property
380
322
  def vocab_size(self) -> int:
@@ -435,6 +377,8 @@ class MistralCommonBackend(PushToHubMixin):
435
377
  padding_side: str | None = None,
436
378
  return_tensors: str | TensorType | None = None,
437
379
  verbose: bool = True,
380
+ return_offsets_mapping: Literal[False] = False,
381
+ split_special_tokens: Literal[False] = False,
438
382
  **kwargs,
439
383
  ) -> list[int]:
440
384
  """
@@ -446,37 +390,81 @@ class MistralCommonBackend(PushToHubMixin):
446
390
  text_pair (`None`, *optional*):
447
391
  Not supported by `MistralCommonBackend.encode`. Kept to match `PreTrainedTokenizerBase.encode` signature.
448
392
  """
393
+ if return_offsets_mapping or split_special_tokens:
394
+ raise ValueError(
395
+ "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
396
+ )
397
+
398
+ if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND, "only_first", "only_second"]:
399
+ raise ValueError(
400
+ "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonBackend`."
401
+ )
402
+
449
403
  if kwargs:
450
404
  raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.encode`.")
405
+
451
406
  if text_pair:
452
407
  raise ValueError("`MistralCommonBackend.encode` does not support `text_pair`.")
453
408
 
454
- padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
409
+ return super().encode(
410
+ text=text,
411
+ text_pair=text_pair,
412
+ add_special_tokens=add_special_tokens,
455
413
  padding=padding,
456
414
  truncation=truncation,
457
415
  max_length=max_length,
458
- pad_to_multiple_of=pad_to_multiple_of,
459
- verbose=verbose,
460
- )
461
-
462
- encoded_inputs = self._encode_plus(
463
- text,
464
- add_special_tokens=add_special_tokens,
465
- padding_strategy=padding_strategy,
466
- truncation_strategy=truncation_strategy,
467
- max_length=max_length,
468
416
  stride=stride,
417
+ return_tensors=return_tensors,
469
418
  pad_to_multiple_of=pad_to_multiple_of,
470
419
  padding_side=padding_side,
471
- return_tensors=return_tensors,
472
- return_attention_mask=False,
473
- return_overflowing_tokens=False,
474
- return_special_tokens_mask=False,
475
- return_length=False,
476
420
  verbose=verbose,
477
421
  )
478
422
 
479
- return encoded_inputs["input_ids"]
423
+ def _decode(
424
+ self,
425
+ token_ids: int | list[int],
426
+ skip_special_tokens: bool = False,
427
+ clean_up_tokenization_spaces: bool | None = None,
428
+ **kwargs,
429
+ ) -> str:
430
+ if kwargs:
431
+ raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.decode`.")
432
+
433
+ token_ids = to_py_obj(token_ids)
434
+
435
+ if isinstance(token_ids, int):
436
+ token_ids = [token_ids]
437
+
438
+ special_token_policy = SpecialTokenPolicy.IGNORE if skip_special_tokens else SpecialTokenPolicy.KEEP
439
+
440
+ text = self.tokenizer.decode(token_ids, special_token_policy=special_token_policy)
441
+
442
+ # Apply tokenizer-specific cleanup if available and requested
443
+ clean_up_tokenization_spaces = (
444
+ clean_up_tokenization_spaces
445
+ if clean_up_tokenization_spaces is not None
446
+ else self.clean_up_tokenization_spaces
447
+ )
448
+ if clean_up_tokenization_spaces:
449
+ # Call custom cleanup method if it exists (e.g., for CLVP's [SPACE] token replacement)
450
+ if hasattr(self, "clean_up_tokenization") and callable(self.clean_up_tokenization):
451
+ text = self.clean_up_tokenization(text)
452
+ else:
453
+ # Otherwise apply standard cleanup
454
+ text = (
455
+ text.replace(" .", ".")
456
+ .replace(" ?", "?")
457
+ .replace(" !", "!")
458
+ .replace(" ,", ",")
459
+ .replace(" ' ", "'")
460
+ .replace(" n't", "n't")
461
+ .replace(" 'm", "'m")
462
+ .replace(" 's", "'s")
463
+ .replace(" 've", "'ve")
464
+ .replace(" 're", "'re")
465
+ )
466
+
467
+ return _maybe_remove_lang(text=text, skip_special_tokens=skip_special_tokens)
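The post-processing above (space cleanup plus the Voxtral-style `lang:xx` prefix removal) can be reproduced in a standalone sketch. `_maybe_remove_lang` is only referenced by name in this diff, so the regex below mirrors the previous inline implementation and is an assumption about the new helper.

```python
# Standalone illustration of the decode post-processing, independent of the library.
import re

def cleanup_spaces(text: str) -> str:
    # Same replacement chain as the fallback cleanup branch in `_decode` above.
    for src, dst in [(" .", "."), (" ?", "?"), (" !", "!"), (" ,", ","),
                     (" ' ", "'"), (" n't", "n't"), (" 'm", "'m"),
                     (" 's", "'s"), (" 've", "'ve"), (" 're", "'re")]:
        text = text.replace(src, dst)
    return text

def maybe_remove_lang(text: str, skip_special_tokens: bool) -> str:
    # Assumed to mirror the old inline behavior: strip a leading ISO 639-1 "lang:xx" marker.
    return re.sub(r"^lang:[a-z]{2}", "", text) if skip_special_tokens else text

print(maybe_remove_lang(cleanup_spaces("lang:en Hello , world !"), skip_special_tokens=True))
# -> " Hello, world!"
```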
480
468
 
481
469
  def decode(
482
470
  self,
@@ -484,7 +472,7 @@ class MistralCommonBackend(PushToHubMixin):
484
472
  skip_special_tokens: bool = False,
485
473
  clean_up_tokenization_spaces: bool | None = None,
486
474
  **kwargs,
487
- ) -> Union[str, list[str]]:
475
+ ) -> str | list[str]:
488
476
  """
489
477
  Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
490
478
  tokens and clean up tokenization spaces.
@@ -509,16 +497,7 @@ class MistralCommonBackend(PushToHubMixin):
509
497
  if kwargs:
510
498
  raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.decode`.")
511
499
 
512
- token_ids = to_py_obj(token_ids)
513
-
514
- if isinstance(token_ids, (list, tuple)) and len(token_ids) > 0 and isinstance(token_ids[0], (list, tuple)):
515
- return self._batch_decode(
516
- sequences=token_ids,
517
- skip_special_tokens=skip_special_tokens,
518
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
519
- )
520
-
521
- return self._decode(
500
+ return super().decode(
522
501
  token_ids=token_ids,
523
502
  skip_special_tokens=skip_special_tokens,
524
503
  clean_up_tokenization_spaces=clean_up_tokenization_spaces,
@@ -555,63 +534,12 @@ class MistralCommonBackend(PushToHubMixin):
555
534
  if kwargs:
556
535
  raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.batch_decode`.")
557
536
 
558
- return self._batch_decode(
537
+ return super().batch_decode(
559
538
  sequences=sequences,
560
539
  skip_special_tokens=skip_special_tokens,
561
540
  clean_up_tokenization_spaces=clean_up_tokenization_spaces,
562
541
  )
563
542
 
564
- def _decode(
565
- self,
566
- token_ids: Union[int, list[int], list[list[int]], np.ndarray, "torch.Tensor"],
567
- skip_special_tokens: bool = False,
568
- clean_up_tokenization_spaces: bool | None = None,
569
- ) -> str:
570
- clean_up_tokenization_spaces = clean_up_tokenization_spaces or self.cleanup_tokenization_spaces
571
-
572
- # Convert inputs to python lists
573
- if isinstance(token_ids, int):
574
- token_ids = [token_ids]
575
-
576
- token_ids = to_py_obj(token_ids)
577
-
578
- special_token_policy = SpecialTokenPolicy.IGNORE if skip_special_tokens else SpecialTokenPolicy.KEEP
579
-
580
- decoded_string = self.tokenizer.decode(token_ids, special_token_policy=special_token_policy)
581
- if clean_up_tokenization_spaces:
582
- decoded_string = self.clean_up_tokenization(decoded_string)
583
-
584
- # in the specific case of Voxtral, the added f"lang:xx" (always a two char language code since it follows ISO 639-1 alpha-2 format)
585
- # is not considered as a special token by mistral-common and is encoded/ decoded as normal text.
586
- # Nevertheless we should remove it to ease users life.
587
- if skip_special_tokens:
588
- decoded_string = re.sub(r"^lang:[a-z]{2}", "", decoded_string)
589
-
590
- return decoded_string
591
-
592
- def _batch_decode(
593
- self,
594
- sequences: Union[list[int], list[list[int]], np.ndarray, "torch.Tensor"],
595
- skip_special_tokens: bool = False,
596
- clean_up_tokenization_spaces: bool | None = None,
597
- ) -> list[str]:
598
- return [
599
- self._decode(
600
- seq,
601
- skip_special_tokens=skip_special_tokens,
602
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
603
- )
604
- for seq in sequences
605
- ]
606
-
607
- def _is_control_token(self, token_id: int) -> bool:
608
- if self._tokenizer_type == MistralTokenizerType.spm:
609
- return token_id in self.tokenizer.instruct_tokenizer.tokenizer._control_tokens
610
- elif self._tokenizer_type == MistralTokenizerType.tekken:
611
- return token_id < self.tokenizer.instruct_tokenizer.tokenizer.num_special_tokens
612
- else:
613
- raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
614
-
615
543
  @overload
616
544
  def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ...
617
545
  @overload
@@ -632,22 +560,22 @@ class MistralCommonBackend(PushToHubMixin):
632
560
  """
633
561
 
634
562
  if isinstance(ids, int):
635
- one_token = True
563
+ return_int = True
636
564
  ids = [ids]
637
565
  else:
638
- one_token = False
566
+ return_int = False
639
567
 
640
568
  tokens: list[str] = []
641
569
  for token_id in ids:
642
- if self._is_control_token(token_id) and skip_special_tokens:
570
+ if self.tokenizer.instruct_tokenizer.tokenizer.is_special(token_id) and skip_special_tokens:
643
571
  continue
644
572
  tokens.append(self.tokenizer.instruct_tokenizer.tokenizer.id_to_piece(token_id))
645
573
 
646
- if one_token:
647
- if tokens == []:
648
- raise ValueError(f"Invalid token id {ids}.")
649
-
574
+ if return_int and tokens == []:
575
+ raise ValueError(f"Invalid token id {ids[0]}.")
576
+ elif return_int:
650
577
  return tokens[0]
578
+
651
579
  return tokens
652
580
 
653
581
  def _tekken_piece_to_id(self, piece: str, warn: bool) -> int:
@@ -708,7 +636,13 @@ class MistralCommonBackend(PushToHubMixin):
708
636
  tokens_ids = self.tokenizer.instruct_tokenizer.tokenizer.encode(text, bos=add_special_tokens, eos=add_eos)
709
637
  return tokens_ids
710
638
 
711
- def tokenize(self, text: TextInput, **kwargs) -> list[str]:
639
+ def tokenize(
640
+ self,
641
+ text: TextInput,
642
+ return_offsets_mapping: Literal[False] = False,
643
+ split_special_tokens: Literal[False] = False,
644
+ **kwargs,
645
+ ) -> list[str]:
712
646
  """
713
647
  Converts a string into a sequence of tokens, using the tokenizer.
714
648
 
@@ -717,6 +651,8 @@ class MistralCommonBackend(PushToHubMixin):
717
651
  Args:
718
652
  text (`str`):
719
653
  The sequence to be encoded.
654
+ return_offsets_mapping (`Literal[False]`, *optional*): Must be `False`; kept only to match Transformers' signature.
655
+ split_special_tokens (`Literal[False]`, *optional*): Must be `False`; kept only to match Transformers' signature.
720
656
  **kwargs (additional keyword arguments):
721
657
  Not supported by `MistralCommonBackend.tokenize`.
722
658
  Will raise an error if used.
@@ -724,40 +660,164 @@ class MistralCommonBackend(PushToHubMixin):
724
660
  Returns:
725
661
  `list[str]`: The list of tokens.
726
662
  """
663
+ if return_offsets_mapping or split_special_tokens:
664
+ raise ValueError(
665
+ "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
666
+ )
667
+
727
668
  if kwargs:
728
669
  raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.tokenize`.")
729
670
 
730
671
  return self.convert_ids_to_tokens(self._text_to_ids(text, add_special_tokens=False), skip_special_tokens=False)
731
672
 
732
- def _encode_plus(
673
+ def _get_all_special_ids(self) -> set[int]:
674
+ if self._tokenizer_type == MistralTokenizerType.tekken:
675
+ return self.tokenizer.instruct_tokenizer.tokenizer._special_token_ids
676
+ elif self._tokenizer_type == MistralTokenizerType.spm:
677
+ return {
678
+ token_id
679
+ for token_id in range(self.tokenizer.instruct_tokenizer.tokenizer.n_words)
680
+ if self.tokenizer.instruct_tokenizer.tokenizer.is_special(token_id)
681
+ }
682
+ else:
683
+ raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
684
+
685
+ def get_special_tokens_mask(
686
+ self, token_ids_0: list[int], token_ids_1: None = None, already_has_special_tokens: bool = False
687
+ ) -> list[int]:
688
+ """
689
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
690
+ special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
691
+
692
+ Args:
693
+ token_ids_0 (`list[int]`): List of ids of the sequence.
694
+ token_ids_1 (`None`, *optional*): None, kept to match Transformers' implementation.
695
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
696
+ Whether or not the token list is already formatted with special tokens for the model.
697
+
698
+ Returns:
699
+ A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
700
+ """
701
+ if token_ids_1 is not None:
702
+ raise ValueError(
703
+ "`token_ids_1` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
704
+ )
705
+
706
+ if already_has_special_tokens:
707
+ return [1 if int(token_id) in self._all_special_ids else 0 for token_id in token_ids_0]
708
+
709
+ if self.mode == ValidationMode.test:
710
+ # [BOS] seq0
711
+ return [1] + ([0] * len(token_ids_0))
712
+ else:
713
+ # [BOS] seq0 [EOS]
714
+ return [1] + ([0] * len(token_ids_0)) + [1]
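A hedged sketch of the mask semantics above, with made-up token ids; the construction assumptions are the same as in the earlier example.

```python
# Hedged sketch (not part of the diff); construction assumptions as in the earlier example.
from transformers import MistralCommonBackend  # assumed export path

tok = MistralCommonBackend(tokenizer_path="/path/to/tekken.json", mode="test")

# Mask derived from the validation mode alone ("test" prepends only a BOS):
print(tok.get_special_tokens_mask([11, 22, 33]))  # [1, 0, 0, 0]

# Mask computed by membership when the ids already contain special tokens:
ids = tok.encode("hello world")  # [BOS, ...] in "test" mode
print(tok.get_special_tokens_mask(ids, already_has_special_tokens=True))  # [1, 0, ..., 0]
```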
715
+
716
+ def _encode_plus( # type: ignore[override]
733
717
  self,
734
- text: TextInput | EncodedInput,
718
+ text: TextInput | PreTokenizedInput | EncodedInput,
719
+ text_pair: None = None,
735
720
  add_special_tokens: bool = True,
736
721
  padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
737
722
  truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
738
723
  max_length: int | None = None,
739
724
  stride: int = 0,
725
+ is_split_into_words: bool = False,
740
726
  pad_to_multiple_of: int | None = None,
741
727
  padding_side: str | None = None,
742
728
  return_tensors: str | TensorType | None = None,
729
+ return_token_type_ids: bool | None = None,
743
730
  return_attention_mask: bool | None = None,
744
731
  return_overflowing_tokens: bool = False,
745
732
  return_special_tokens_mask: bool = False,
746
733
  return_length: bool = False,
747
734
  verbose: bool = True,
735
+ return_offsets_mapping: Literal[False] = False,
736
+ split_special_tokens: Literal[False] = False,
737
+ **kwargs,
748
738
  ) -> BatchEncoding:
739
+ # Detect batched inputs (list of sequences)
740
+ if text_pair is not None:
741
+ raise ValueError("`MistralCommonBackend` does not support `text_pair != None` for `_encode_plus`.")
742
+
743
+ if return_offsets_mapping or split_special_tokens:
744
+ raise ValueError(
745
+ "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
746
+ )
747
+
748
+ if kwargs:
749
+ raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend._encode_plus`.")
750
+
751
+ is_batched = isinstance(text, (list, tuple)) and (
752
+ (not text and not is_split_into_words)
753
+ or (text and is_split_into_words and isinstance(text[0], (list, tuple)))
754
+ or (text and not is_split_into_words and isinstance(text[0], (str, list, tuple)))
755
+ )
756
+
757
+ if is_batched:
758
+ batch_outputs = {}
759
+ one_overflowed = False
760
+ for current_text in text:
761
+ current_output = self._encode_plus(
762
+ text=current_text,
763
+ text_pair=None,
764
+ add_special_tokens=add_special_tokens,
765
+ padding_strategy=PaddingStrategy.DO_NOT_PAD, # we pad in batch afterward
766
+ truncation_strategy=truncation_strategy,
767
+ max_length=max_length,
768
+ stride=stride,
769
+ is_split_into_words=is_split_into_words,
770
+ pad_to_multiple_of=None, # we pad in batch afterward
771
+ padding_side=None, # we pad in batch afterward
772
+ return_tensors=None, # We convert the whole batch to tensors at the end
773
+ return_token_type_ids=return_token_type_ids,
774
+ return_attention_mask=False, # we pad in batch afterward
775
+ return_overflowing_tokens=return_overflowing_tokens,
776
+ return_special_tokens_mask=return_special_tokens_mask,
777
+ return_length=return_length,
778
+ verbose=verbose,
779
+ )
780
+ for key, value in current_output.items():
781
+ batch_outputs.setdefault(key, []).append(value)
782
+
783
+ # Keep these lists aligned, with one entry per sample, even for samples that did not overflow.
784
+ if return_overflowing_tokens and not return_tensors:
785
+ if "overflowing_tokens" not in current_output:
786
+ batch_outputs.setdefault("overflowing_tokens", []).append([0])
787
+ batch_outputs.setdefault("num_truncated_tokens", []).append([0])
788
+ else:
789
+ one_overflowed = True
790
+
791
+ # Remove overflow-related keys before tensor conversion, or when no sample in the batch overflowed.
792
+ # Slow tokenizers don't support returning these as tensors
793
+ if return_overflowing_tokens and (return_tensors or not one_overflowed):
794
+ batch_outputs.pop("overflowing_tokens", None)
795
+ batch_outputs.pop("num_truncated_tokens", None)
796
+
797
+ batch_outputs = self.pad(
798
+ batch_outputs,
799
+ padding=padding_strategy.value,
800
+ max_length=max_length,
801
+ pad_to_multiple_of=pad_to_multiple_of,
802
+ padding_side=padding_side,
803
+ return_attention_mask=return_attention_mask,
804
+ )
805
+
806
+ return BatchEncoding(batch_outputs, tensor_type=return_tensors)
807
+
749
808
  def get_input_ids(text):
750
809
  if isinstance(text, str):
751
- return self._text_to_ids(text, add_special_tokens)
810
+ return self._text_to_ids(text, False)
752
811
  elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
753
812
  return text
754
813
  else:
755
814
  raise ValueError(f"Input {text} is not valid. Should be a string, or a list/tuple of integers.")
756
815
 
757
- ids = get_input_ids(text)
816
+ first_ids = get_input_ids(text)
758
817
 
759
818
  return self.prepare_for_model(
760
- ids,
819
+ first_ids,
820
+ pair_ids=None,
761
821
  add_special_tokens=add_special_tokens,
762
822
  padding=padding_strategy.value,
763
823
  truncation=truncation_strategy.value,
@@ -768,202 +828,62 @@ class MistralCommonBackend(PushToHubMixin):
768
828
  return_tensors=return_tensors,
769
829
  prepend_batch_axis=True,
770
830
  return_attention_mask=return_attention_mask,
831
+ return_token_type_ids=return_token_type_ids,
771
832
  return_overflowing_tokens=return_overflowing_tokens,
772
833
  return_special_tokens_mask=return_special_tokens_mask,
773
834
  return_length=return_length,
774
835
  verbose=verbose,
775
836
  )
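The batched branch above encodes each sample unpadded and defers padding and the attention mask to one `pad` call over the whole batch. A standalone illustration of that pattern, independent of the class (the toy tokenizer, pad id, and padding side are made up):

```python
# Standalone illustration of "encode per sample, pad the batch afterward".
PAD_ID, PAD_SIDE = 0, "left"

def encode_one(text: str) -> list[int]:
    return [len(w) for w in text.split()]  # toy stand-in for the per-sample encode

def pad_batch(batch: list[list[int]]) -> dict[str, list[list[int]]]:
    longest = max(len(ids) for ids in batch)
    input_ids, attention_mask = [], []
    for ids in batch:
        pad = [PAD_ID] * (longest - len(ids))
        mask = [1] * len(ids)
        if PAD_SIDE == "left":
            input_ids.append(pad + ids)
            attention_mask.append([0] * len(pad) + mask)
        else:
            input_ids.append(ids + pad)
            attention_mask.append(mask + [0] * len(pad))
    return {"input_ids": input_ids, "attention_mask": attention_mask}

batch = [encode_one(t) for t in ["a bb ccc", "dddd ee"]]
print(pad_batch(batch))
# {'input_ids': [[1, 2, 3], [0, 4, 2]], 'attention_mask': [[1, 1, 1], [0, 1, 1]]}
```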
776
837
 
777
- def _batch_encode_plus(
838
+ @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
839
+ def prepare_for_model(
778
840
  self,
779
- batch_text: list[TextInput] | list[EncodedInput],
841
+ ids: list[int],
842
+ pair_ids: None = None,
780
843
  add_special_tokens: bool = True,
781
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
782
- truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
844
+ padding: bool | str | PaddingStrategy = False,
845
+ truncation: bool | str | TruncationStrategy | None = None,
783
846
  max_length: int | None = None,
784
847
  stride: int = 0,
785
848
  pad_to_multiple_of: int | None = None,
786
849
  padding_side: str | None = None,
787
850
  return_tensors: str | TensorType | None = None,
851
+ return_token_type_ids: bool | None = None,
788
852
  return_attention_mask: bool | None = None,
789
853
  return_overflowing_tokens: bool = False,
790
854
  return_special_tokens_mask: bool = False,
791
855
  return_length: bool = False,
792
856
  verbose: bool = True,
857
+ prepend_batch_axis: bool = False,
858
+ return_offsets_mapping: Literal[False] = False,
859
+ split_special_tokens: Literal[False] = False,
860
+ **kwargs,
793
861
  ) -> BatchEncoding:
794
- def get_input_ids(text):
795
- if isinstance(text, str):
796
- return self._text_to_ids(text, add_special_tokens)
797
- elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
798
- return text
799
- else:
800
- raise ValueError("Input is not valid. Should be a string or a list/tuple of integers.")
801
-
802
- input_ids = []
803
- for ids in batch_text:
804
- input_ids.append(get_input_ids(ids))
805
-
806
- batch_outputs = self._batch_prepare_for_model(
807
- input_ids,
808
- add_special_tokens=add_special_tokens,
809
- padding_strategy=padding_strategy,
810
- truncation_strategy=truncation_strategy,
811
- max_length=max_length,
812
- stride=stride,
813
- pad_to_multiple_of=pad_to_multiple_of,
814
- padding_side=padding_side,
815
- return_attention_mask=return_attention_mask,
816
- return_overflowing_tokens=return_overflowing_tokens,
817
- return_special_tokens_mask=return_special_tokens_mask,
818
- return_length=return_length,
819
- return_tensors=return_tensors,
820
- verbose=verbose,
821
- )
822
-
823
- return BatchEncoding(batch_outputs)
824
-
825
- def _get_all_special_ids(self) -> set[int]:
826
- if self._tokenizer_type == MistralTokenizerType.tekken:
827
- return {t["rank"] for t in self.tokenizer.instruct_tokenizer.tokenizer._all_special_tokens}
828
- elif self._tokenizer_type == MistralTokenizerType.spm:
829
- return self.tokenizer.instruct_tokenizer.tokenizer._control_tokens
830
- else:
831
- raise ValueError(f"Unknown tokenizer type: {self._tokenizer_type}")
832
-
833
- def get_special_tokens_mask(
834
- self, token_ids_0: list, token_ids_1: None = None, already_has_special_tokens: bool = False
835
- ) -> list[int]:
836
862
  """
837
- Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
838
- special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
863
+ Prepares a sequence of input ids so that it can be used by the model. It
864
+ adds special tokens, truncates sequences if they overflow while taking the special tokens into account, and
865
+ manages a moving window (with a user-defined stride) for overflowing tokens.
839
866
 
840
867
  Args:
841
- token_ids_0 (`list[int]`):
842
- List of ids of the sequence.
843
- token_ids_1 (`list[int]`, *optional*):
868
+ ids (`list[int]`):
869
+ Tokenized input ids of the first sequence.
870
+ pair_ids (`None`, *optional*):
844
871
  Not supported by `MistralCommonBackend`. Kept to match the interface of `PreTrainedTokenizerBase`.
845
- already_has_special_tokens (`bool`, *optional*, defaults to `False`):
846
- Whether or not the token list is already formatted with special tokens for the model.
847
-
848
- Returns:
849
- A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
850
872
  """
851
- if token_ids_1 is not None:
873
+ if return_offsets_mapping or split_special_tokens:
852
874
  raise ValueError(
853
- "`token_ids_1` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
875
+ "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
854
876
  )
855
- if already_has_special_tokens:
877
+
878
+ if pair_ids is not None:
856
879
  raise ValueError(
857
- "`already_has_special_tokens` is not supported by `MistralCommonBackend` and should be `False`."
880
+ "`pair_ids` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
858
881
  )
859
882
 
860
- special_tokens_mask = [1 if token in self._all_special_tokens_ids else 0 for token in token_ids_0]
861
- return special_tokens_mask
862
-
863
- def _batch_prepare_for_model(
864
- self,
865
- batch_ids: list[PreTokenizedInput | list[int]],
866
- add_special_tokens: bool = True,
867
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
868
- truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
869
- max_length: int | None = None,
870
- stride: int = 0,
871
- pad_to_multiple_of: int | None = None,
872
- padding_side: str | None = None,
873
- return_tensors: str | None = None,
874
- return_attention_mask: bool | None = None,
875
- return_overflowing_tokens: bool = False,
876
- return_special_tokens_mask: bool = False,
877
- return_length: bool = False,
878
- verbose: bool = True,
879
- ) -> BatchEncoding:
880
- """
881
- Prepares a sequence of input id so that it can be used by the model. It
882
- adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
883
- manages a moving window (with user defined stride) for overflowing tokens.
884
-
885
- Args:
886
- batch_ids: list of tokenized input ids
887
- """
888
-
889
- batch_outputs = {}
890
- for ids in batch_ids:
891
- outputs = self.prepare_for_model(
892
- ids,
893
- add_special_tokens=add_special_tokens,
894
- padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward
895
- truncation=truncation_strategy.value,
896
- max_length=max_length,
897
- stride=stride,
898
- pad_to_multiple_of=None, # we pad in batch afterward
899
- padding_side=None, # we pad in batch afterward
900
- return_attention_mask=False, # we pad in batch afterward
901
- return_overflowing_tokens=return_overflowing_tokens,
902
- return_special_tokens_mask=return_special_tokens_mask,
903
- return_length=return_length,
904
- return_tensors=None, # We convert the whole batch to tensors at the end
905
- prepend_batch_axis=False,
906
- verbose=verbose,
907
- )
908
-
909
- for key, value in outputs.items():
910
- if key not in batch_outputs:
911
- batch_outputs[key] = []
912
- batch_outputs[key].append(value)
913
-
914
- batch_outputs = self.pad(
915
- batch_outputs,
916
- padding=padding_strategy.value,
917
- max_length=max_length,
918
- pad_to_multiple_of=pad_to_multiple_of,
919
- padding_side=padding_side,
920
- return_attention_mask=return_attention_mask,
921
- )
922
-
923
- batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
924
-
925
- return batch_outputs
926
-
927
- @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
928
- def prepare_for_model(
929
- self,
930
- ids: list[int],
931
- pair_ids: None = None,
932
- add_special_tokens: bool = True,
933
- padding: bool | str | PaddingStrategy = False,
934
- truncation: bool | str | TruncationStrategy | None = None,
935
- max_length: int | None = None,
936
- stride: int = 0,
937
- pad_to_multiple_of: int | None = None,
938
- padding_side: str | None = None,
939
- return_tensors: str | TensorType | None = None,
940
- return_attention_mask: bool | None = None,
941
- return_overflowing_tokens: bool = False,
942
- return_special_tokens_mask: bool = False,
943
- return_length: bool = False,
944
- verbose: bool = True,
945
- prepend_batch_axis: bool = False,
946
- **kwargs,
947
- ) -> BatchEncoding:
948
- """
949
- Prepares a sequence of input id so that it can be used by the model. It
950
- adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
951
- manages a moving window (with user defined stride) for overflowing tokens.
952
-
953
- Args:
954
- ids (`list[int]`):
955
- Tokenized input ids of the first sequence.
956
- pair_ids (`None`, *optional*):
957
- Not supported by `MistralCommonBackend`. Kept to match the interface of `PreTrainedTokenizerBase`.
958
- """
959
- if pair_ids is not None:
960
- raise ValueError(
961
- "`pair_ids` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
962
- )
963
- if kwargs:
964
- raise ValueError(
965
- f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.prepare_for_model`."
966
- )
883
+ if kwargs:
884
+ raise ValueError(
885
+ f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.prepare_for_model`."
886
+ )
967
887
 
968
888
  padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
969
889
  padding=padding,
@@ -971,39 +891,65 @@ class MistralCommonBackend(PushToHubMixin):
971
891
  max_length=max_length,
972
892
  pad_to_multiple_of=pad_to_multiple_of,
973
893
  verbose=verbose,
894
+ **kwargs,
974
895
  )
975
896
 
976
- len_ids = len(ids)
897
+ # Validation
898
+ if (
899
+ return_overflowing_tokens
900
+ and truncation_strategy == TruncationStrategy.LONGEST_FIRST
901
+ and pair_ids is not None
902
+ ):
903
+ raise ValueError(
904
+ "Not possible to return overflowing tokens for pair of sequences with the "
905
+ "`longest_first`. Please select another truncation strategy than `longest_first`, "
906
+ "for instance `only_second` or `only_first`."
907
+ )
977
908
 
978
- # Load from model defaults
909
+ # Defaults
910
+ if return_token_type_ids is None:
911
+ return_token_type_ids = "token_type_ids" in self.model_input_names
979
912
  if return_attention_mask is None:
980
913
  return_attention_mask = "attention_mask" in self.model_input_names
981
914
 
982
- encoded_inputs = {}
915
+ # Truncation
916
+ num_special = self.num_special_tokens_to_add(pair=False) if add_special_tokens else 0
917
+ total_len = len(ids) + len(pair_ids or []) + num_special
983
918
 
984
- # Truncation: Handle max sequence length
985
919
  overflowing_tokens = []
986
- if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and len_ids > max_length:
920
+ if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
987
921
  ids, _, overflowing_tokens = self.truncate_sequences(
988
922
  ids,
989
- num_tokens_to_remove=len_ids - max_length,
923
+ pair_ids=None,
924
+ num_tokens_to_remove=total_len - max_length,
990
925
  truncation_strategy=truncation_strategy,
991
926
  stride=stride,
992
927
  )
993
928
 
994
- if return_overflowing_tokens:
995
- encoded_inputs["overflowing_tokens"] = overflowing_tokens
996
- encoded_inputs["num_truncated_tokens"] = len_ids - max_length
929
+ # Add special tokens
930
+ if add_special_tokens:
931
+ sequence = self.build_inputs_with_special_tokens(ids, None)
932
+ token_type_ids = self.create_token_type_ids_from_sequences(ids, None)
933
+ else:
934
+ sequence = ids
935
+ token_type_ids = [0] * len(sequence)
997
936
 
998
- # Build output dictionary
999
- encoded_inputs[self.model_input_names[0]] = ids
937
+ # Build output
938
+ encoded_inputs = {"input_ids": sequence}
939
+ if return_token_type_ids:
940
+ encoded_inputs["token_type_ids"] = token_type_ids
1000
941
  if return_special_tokens_mask:
1001
- if add_special_tokens:
1002
- encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, None)
1003
- else:
1004
- encoded_inputs["special_tokens_mask"] = [0] * len(ids)
942
+ encoded_inputs["special_tokens_mask"] = (
943
+ self.get_special_tokens_mask(ids, None) if add_special_tokens else [0] * len(sequence)
944
+ )
945
+ if return_overflowing_tokens and not return_tensors and overflowing_tokens:
946
+ encoded_inputs["overflowing_tokens"] = overflowing_tokens
947
+ encoded_inputs["num_truncated_tokens"] = total_len - max_length if max_length else 0
948
+
949
+ # Check sequence length and warn if needed
950
+ self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
1005
951
 
1006
- # Padding
952
+ # Pad
1007
953
  if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
1008
954
  encoded_inputs = self.pad(
1009
955
  encoded_inputs,
@@ -1017,362 +963,9 @@ class MistralCommonBackend(PushToHubMixin):
1017
963
  if return_length:
1018
964
  encoded_inputs["length"] = len(encoded_inputs["input_ids"])
1019
965
 
1020
- batch_outputs = BatchEncoding(
1021
- encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
1022
- )
1023
-
1024
- return batch_outputs
1025
-
1026
- def _get_padding_truncation_strategies(
1027
- self,
1028
- padding: str | PaddingStrategy | bool = False,
1029
- truncation: str | TruncationStrategy | bool | None = None,
1030
- max_length: int | None = None,
1031
- pad_to_multiple_of: int | None = None,
1032
- verbose: bool = True,
1033
- **kwargs,
1034
- ):
1035
- """
1036
- Find the correct padding/truncation strategy.
1037
- """
1038
-
1039
- # Backward compatibility for previous behavior, maybe we should deprecate it:
1040
- # If you only set max_length, it activates truncation for max_length
1041
- if max_length is not None and padding is False and truncation is None:
1042
- if verbose:
1043
- if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
1044
- logger.warning(
1045
- "Truncation was not explicitly activated but `max_length` is provided a specific value, please"
1046
- " use `truncation=True` to explicitly truncate examples to max length. Defaulting to"
1047
- " 'longest_first' truncation strategy."
1048
- )
1049
- self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
1050
- truncation = "longest_first"
1051
-
1052
- # Get padding strategy
1053
- if padding is not False:
1054
- if padding is True:
1055
- if verbose:
1056
- if max_length is not None and (
1057
- truncation is None or truncation is False or truncation == "do_not_truncate"
1058
- ):
1059
- warnings.warn(
1060
- "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
1061
- "To pad to max length, use `padding='max_length'`."
1062
- )
1063
- padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch
1064
- elif not isinstance(padding, PaddingStrategy):
1065
- padding_strategy = PaddingStrategy(padding)
1066
- elif isinstance(padding, PaddingStrategy):
1067
- padding_strategy = padding
1068
- else:
1069
- padding_strategy = PaddingStrategy.DO_NOT_PAD
1070
-
1071
- # Get truncation strategy
1072
- if truncation is not False and truncation is not None:
1073
- if truncation is True:
1074
- truncation_strategy = (
1075
- TruncationStrategy.LONGEST_FIRST
1076
- ) # Default to truncate the longest sequences in pairs of inputs
1077
- elif not isinstance(truncation, TruncationStrategy):
1078
- truncation_strategy = TruncationStrategy(truncation)
1079
- elif isinstance(truncation, TruncationStrategy):
1080
- truncation_strategy = truncation
1081
- if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND]:
1082
- raise ValueError(
1083
- "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonBackend`."
1084
- )
1085
- else:
1086
- truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
1087
-
1088
- # Set max length if needed
1089
- if max_length is None:
1090
- if padding_strategy == PaddingStrategy.MAX_LENGTH:
1091
- if self.model_max_length > LARGE_INTEGER:
1092
- if verbose:
1093
- if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
1094
- logger.warning(
1095
- "Asking to pad to max_length but no maximum length is provided and the model has no"
1096
- " predefined maximum length. Default to no padding."
1097
- )
1098
- self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
1099
- padding_strategy = PaddingStrategy.DO_NOT_PAD
1100
- else:
1101
- max_length = self.model_max_length
1102
-
1103
- if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
1104
- if self.model_max_length > LARGE_INTEGER:
1105
- if verbose:
1106
- if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
1107
- logger.warning(
1108
- "Asking to truncate to max_length but no maximum length is provided and the model has"
1109
- " no predefined maximum length. Default to no truncation."
1110
- )
1111
- self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
1112
- truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
1113
- else:
1114
- max_length = self.model_max_length
1115
-
1116
- # Test if we have a padding token
1117
- if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.pad_token_id is None or self.pad_token_id < 0):
1118
- raise ValueError(
1119
- "Asking to pad but the tokenizer does not have a padding token. "
1120
- "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
1121
- "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
1122
- )
1123
-
1124
- # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
1125
- if (
1126
- truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
1127
- and padding_strategy != PaddingStrategy.DO_NOT_PAD
1128
- and pad_to_multiple_of is not None
1129
- and max_length is not None
1130
- and (max_length % pad_to_multiple_of != 0)
1131
- ):
1132
- raise ValueError(
1133
- "Truncation and padding are both activated but "
1134
- f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
1135
- )
1136
-
1137
- return padding_strategy, truncation_strategy, max_length, kwargs
1138
-
1139
- def _pad(
1140
- self,
1141
- encoded_inputs: dict[str, EncodedInput] | BatchEncoding,
1142
- max_length: int | None = None,
1143
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
1144
- pad_to_multiple_of: int | None = None,
1145
- padding_side: str | None = None,
1146
- return_attention_mask: bool | None = None,
1147
- ) -> dict:
1148
- """
1149
- Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
1150
-
1151
- Args:
1152
- encoded_inputs:
1153
- Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`).
1154
- max_length: maximum length of the returned list and optionally padding length (see below).
1155
- Will truncate by taking into account the special tokens.
1156
- padding_strategy: PaddingStrategy to use for padding.
1157
-
1158
- - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
1159
- - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
1160
- - PaddingStrategy.DO_NOT_PAD: Do not pad
1161
- The tokenizer padding sides are defined in `padding_side` argument:
1162
-
1163
- - 'left': pads on the left of the sequences
1164
- - 'right': pads on the right of the sequences
1165
- pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
1166
- This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
1167
- `>= 7.5` (Volta).
1168
- padding_side:
1169
- The side on which the model should have padding applied. Should be selected between ['right', 'left'].
1170
- Default value is picked from the class attribute of the same name.
1171
- return_attention_mask:
1172
- (optional) Set to False to avoid returning attention mask (default: set to model specifics)
1173
- """
1174
- # Load from model defaults
1175
- if return_attention_mask is None:
1176
- return_attention_mask = "attention_mask" in self.model_input_names
1177
-
1178
- required_input = encoded_inputs[self.model_input_names[0]]
1179
-
1180
- if padding_strategy == PaddingStrategy.LONGEST:
1181
- max_length = len(required_input)
1182
-
1183
- if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
1184
- max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
1185
-
1186
- needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
1187
-
1188
- # Initialize attention mask if not present.
1189
- if return_attention_mask and "attention_mask" not in encoded_inputs:
1190
- encoded_inputs["attention_mask"] = [1] * len(required_input)
1191
-
1192
- if needs_to_be_padded:
1193
- difference = max_length - len(required_input)
1194
- padding_side = padding_side if padding_side is not None else self.padding_side
1195
-
1196
- if padding_side == "right":
1197
- if return_attention_mask:
1198
- encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
1199
- if "special_tokens_mask" in encoded_inputs:
1200
- encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
1201
- encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
1202
- elif padding_side == "left":
1203
- if return_attention_mask:
1204
- encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
1205
- if "special_tokens_mask" in encoded_inputs:
1206
- encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
1207
- encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
1208
- else:
1209
- raise ValueError(f"Invalid padding strategy:{padding_side}")
1210
-
1211
- return encoded_inputs
966
+ return BatchEncoding(encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis)
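To see the flow above end to end (truncation, special tokens, masks, padding), here is a hedged usage sketch with made-up ids. Construction assumptions are as in the earlier examples, and the special-token budget in `"test"` mode (a single leading BOS) is inferred from `build_inputs_with_special_tokens` shown later in this diff.

```python
# Hedged sketch (not part of the diff); construction assumptions as in the earlier examples.
from transformers import MistralCommonBackend  # assumed export path

tok = MistralCommonBackend(tokenizer_path="/path/to/tekken.json", mode="test")

out = tok.prepare_for_model(
    [11, 22, 33, 44, 55],
    truncation=True,
    max_length=4,                    # budget includes the BOS added in "test" mode
    return_special_tokens_mask=True,
)
print(out["input_ids"])              # 4 ids: BOS followed by the first 3 input ids (right truncation)
print(out["special_tokens_mask"])    # [1, 0, 0, 0]
```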
1212
967
 
1213
- def pad(
1214
- self,
1215
- encoded_inputs: BatchEncoding
1216
- | list[BatchEncoding]
1217
- | dict[str, EncodedInput]
1218
- | dict[str, list[EncodedInput]]
1219
- | list[dict[str, EncodedInput]],
1220
- padding: bool | str | PaddingStrategy = True,
1221
- max_length: int | None = None,
1222
- pad_to_multiple_of: int | None = None,
1223
- padding_side: str | None = None,
1224
- return_attention_mask: bool | None = None,
1225
- return_tensors: str | TensorType | None = None,
1226
- verbose: bool = True,
1227
- ) -> BatchEncoding:
1228
- """
1229
- Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
1230
- in the batch.
1231
-
1232
- Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,
1233
- `self.pad_token_id`).
1234
- <Tip>
1235
-
1236
- If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors, the
1237
- result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
1238
- PyTorch tensors, you will lose the specific device of your tensors however.
1239
-
1240
- </Tip>
1241
-
1242
- Args:
1243
- encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, list[int]]`, `Dict[str, list[list[int]]` or `List[Dict[str, list[int]]]`):
1244
- Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, list[int]]`) or a batch of
1245
- tokenized inputs (list of [`BatchEncoding`], *Dict[str, list[list[int]]]* or *List[Dict[str,
1246
- list[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
1247
- collate function.
1248
-
1249
- Instead of `list[int]` you can have tensors (numpy arrays, PyTorch tensors), see
1250
- the note above for the return type.
1251
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
1252
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
1253
- index) among:
1254
-
1255
- - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
1256
- sequence if provided).
1257
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
1258
- acceptable input length for the model if that argument is not provided.
1259
- - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
1260
- lengths).
1261
- max_length (`int`, *optional*):
1262
- Maximum length of the returned list and optionally padding length (see above).
1263
- pad_to_multiple_of (`int`, *optional*):
1264
- If set will pad the sequence to a multiple of the provided value.
1265
-
1266
- This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
1267
- `>= 7.5` (Volta).
1268
- padding_side (`str`, *optional*):
1269
- The side on which the model should have padding applied. Should be selected between ['right', 'left'].
1270
- Default value is picked from the class attribute of the same name.
1271
- return_attention_mask (`bool`, *optional*):
1272
- Whether to return the attention mask. If left to the default, will return the attention mask according
1273
- to the specific tokenizer's default, defined by the `return_outputs` attribute.
1274
-
1275
- [What are attention masks?](../glossary#attention-mask)
1276
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
1277
- If set, will return tensors instead of list of python integers. Acceptable values are:
1278
-
1279
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
1280
- - `'np'`: Return Numpy `np.ndarray` objects.
1281
- verbose (`bool`, *optional*, defaults to `True`):
1282
- Whether or not to print more information and warnings.
1283
- """
1284
- # If we have a list of dicts, let's convert it in a dict of lists
1285
- # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
1286
- if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
1287
- # Call .keys() explicitly for compatibility with TensorDict and other Mapping subclasses
1288
- encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
1289
-
1290
- # The model's main input name, usually `input_ids`, has been passed for padding
1291
- if self.model_input_names[0] not in encoded_inputs:
1292
- raise ValueError(
1293
- "You should supply an encoding or a list of encodings to this method "
1294
- f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
1295
- )
1296
-
1297
- required_input = encoded_inputs[self.model_input_names[0]]
1298
-
1299
- if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):
1300
- if return_attention_mask:
1301
- encoded_inputs["attention_mask"] = []
1302
- return encoded_inputs
1303
-
1304
- # If we have PyTorch/NumPy tensors/arrays as inputs, we cast them as python objects
1305
- # and rebuild them afterwards if no return_tensors is specified
1306
- # Note that we lose the specific device the tensor may be on for PyTorch
1307
-
1308
- first_element = required_input[0]
1309
- if isinstance(first_element, (list, tuple)):
1310
- # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
1311
- for item in required_input:
1312
- if len(item) != 0:
1313
- first_element = item[0]
1314
- break
1315
- # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
1316
- if not isinstance(first_element, (int, list, tuple)):
1317
- if is_torch_tensor(first_element):
1318
- return_tensors = "pt" if return_tensors is None else return_tensors
1319
- elif isinstance(first_element, np.ndarray):
1320
- return_tensors = "np" if return_tensors is None else return_tensors
1321
- else:
1322
- raise ValueError(
1323
- f"type of {first_element} unknown: {type(first_element)}. "
1324
- "Should be one of a python, numpy, or pytorch object."
1325
- )
1326
-
1327
- for key, value in encoded_inputs.items():
1328
- encoded_inputs[key] = to_py_obj(value)
1329
-
1330
- # Convert padding_strategy in PaddingStrategy
1331
- padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
1332
- padding=padding, max_length=max_length, verbose=verbose
1333
- )
1334
-
1335
- required_input = encoded_inputs[self.model_input_names[0]]
1336
- if required_input and not isinstance(required_input[0], (list, tuple)):
1337
- encoded_inputs = self._pad(
1338
- encoded_inputs,
1339
- max_length=max_length,
1340
- padding_strategy=padding_strategy,
1341
- pad_to_multiple_of=pad_to_multiple_of,
1342
- padding_side=padding_side,
1343
- return_attention_mask=return_attention_mask,
1344
- )
1345
- return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
1346
-
1347
- batch_size = len(required_input)
1348
- assert all(len(v) == batch_size for v in encoded_inputs.values()), (
1349
- "Some items in the output dictionary have a different batch size than others."
1350
- )
1351
-
1352
- if padding_strategy == PaddingStrategy.LONGEST:
1353
- max_length = max(len(inputs) for inputs in required_input)
1354
- padding_strategy = PaddingStrategy.MAX_LENGTH
1355
-
1356
- batch_outputs = {}
1357
- for i in range(batch_size):
1358
- inputs = {k: v[i] for k, v in encoded_inputs.items()}
1359
- outputs = self._pad(
1360
- inputs,
1361
- max_length=max_length,
1362
- padding_strategy=padding_strategy,
1363
- pad_to_multiple_of=pad_to_multiple_of,
1364
- padding_side=padding_side,
1365
- return_attention_mask=return_attention_mask,
1366
- )
1367
-
1368
- for key, value in outputs.items():
1369
- if key not in batch_outputs:
1370
- batch_outputs[key] = []
1371
- batch_outputs[key].append(value)
1372
-
1373
- return BatchEncoding(batch_outputs, tensor_type=return_tensors)
1374
-
1375
- def truncate_sequences(
968
+ def truncate_sequences( # type: ignore[override]
1376
969
  self,
1377
970
  ids: list[int],
1378
971
  pair_ids: None = None,
@@ -1407,47 +1000,36 @@ class MistralCommonBackend(PushToHubMixin):
1407
1000
  `Tuple[list[int], None, list[int]]`: The truncated `ids` and the list of
1408
1001
  overflowing tokens. `None` is returned to match Transformers signature.
1409
1002
  """
1410
- if kwargs:
1411
- raise ValueError(
1412
- f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.truncate_sequences`."
1413
- )
1003
+
1414
1004
  if pair_ids:
1415
1005
  raise ValueError("`pair_ids` is not supported by `MistralCommonBackend.truncate_sequences`.")
1416
1006
 
1417
- if num_tokens_to_remove <= 0:
1418
- return (ids, None, [])
1419
-
1420
1007
  if not isinstance(truncation_strategy, TruncationStrategy):
1421
1008
  truncation_strategy = TruncationStrategy(truncation_strategy)
1422
1009
 
1423
- if truncation_strategy in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND]:
1424
- raise ValueError(
1425
- f"Only {TruncationStrategy.LONGEST_FIRST} and {TruncationStrategy.DO_NOT_TRUNCATE} are supported."
1426
- )
1010
+ if truncation_strategy in [
1011
+ TruncationStrategy.ONLY_FIRST,
1012
+ TruncationStrategy.ONLY_SECOND,
1013
+ ]:
1014
+ raise ValueError(f"{truncation_strategy=} is not supported by `MistralCommonBackend`.")
1015
+
1016
+ if num_tokens_to_remove <= 0:
1017
+ return ids, None, []
1427
1018
 
1428
1019
  overflowing_tokens = []
1429
- if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
1430
- if len(ids) > num_tokens_to_remove:
1431
- window_len = min(len(ids), stride + num_tokens_to_remove)
1432
- if self.truncation_side == "left":
1433
- overflowing_tokens = ids[:window_len]
1434
- ids = ids[num_tokens_to_remove:]
1435
- elif self.truncation_side == "right":
1436
- overflowing_tokens = ids[-window_len:]
1437
- ids = ids[:-num_tokens_to_remove]
1438
- else:
1439
- raise ValueError(f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'.")
1440
1020
 
1021
+ if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
1022
+ window_len = min(len(ids), stride + num_tokens_to_remove)
1023
+ if self.truncation_side == "left":
1024
+ overflowing_tokens = ids[:window_len]
1025
+ ids = ids[num_tokens_to_remove:]
1441
1026
  else:
1442
- error_msg = (
1443
- f"We need to remove {num_tokens_to_remove} to truncate the input "
1444
- f"but the first sequence has a length {len(ids)}. "
1445
- )
1446
- logger.error(error_msg)
1027
+ overflowing_tokens = ids[-window_len:]
1028
+ ids = ids[:-num_tokens_to_remove]
1447
1029
 
1448
- return (ids, None, overflowing_tokens)
1030
+ return ids, None, overflowing_tokens
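The stride/overflow bookkeeping above can be shown in isolation. A standalone sketch of the right-side `longest_first` branch (the stride and ids are arbitrary):

```python
# Standalone illustration of right-side truncation with a stride window for overflow.
def truncate_right(ids: list[int], num_tokens_to_remove: int, stride: int = 0):
    if num_tokens_to_remove <= 0:
        return ids, []
    window_len = min(len(ids), stride + num_tokens_to_remove)
    overflowing = ids[-window_len:]  # the removed tokens plus `stride` tokens of context
    return ids[:-num_tokens_to_remove], overflowing

kept, overflow = truncate_right([1, 2, 3, 4, 5, 6], num_tokens_to_remove=2, stride=1)
print(kept)      # [1, 2, 3, 4]
print(overflow)  # [4, 5, 6] -- the 2 removed ids preceded by 1 stride token
```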

-     def apply_chat_template(
+     def apply_chat_template(  # type: ignore[override]
          self,
          conversation: list[dict[str, str]] | list[list[dict[str, str]]],
          tools: list[dict | Callable] | None = None,
@@ -1475,8 +1057,8 @@ class MistralCommonBackend(PushToHubMixin):
                  [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
                  for more information.
              add_generation_prompt (`bool`, *optional*):
-                 This argument is a no-op for `MistralCommonBackend`. However it cannot be used at the same time as `continue_final_message` to keep the API consistent and
-                 if any conversation ends with an assistant message, it will raise an error. In such case, use `continue_final_message` instead.
+                 This argument is a no-op for `MistralCommonBackend`. However, it cannot be used at the same time as `continue_final_message` to keep the API consistent.
+                 If any conversation ends with an assistant message, it will raise an error. In such cases, use `continue_final_message` instead.
              continue_final_message (bool, *optional*):
                  If this is set, the chat will be formatted so that the final
                  message in the chat is open-ended, without any EOS tokens. The model will continue this message
@@ -1511,8 +1093,7 @@ class MistralCommonBackend(PushToHubMixin):
                  Will raise an error if used.

          Returns:
-             `Union[str, list[int], list[str], list[list[int]], BatchEncoding]`: A list of token ids representing the tokenized chat so far, including control
-             tokens. This output is ready to pass to the model, either directly or via methods like `generate()`.
+             `Union[str, list[int], list[str], list[list[int]], BatchEncoding]`: The tokenized chat so far, including control tokens. This output is ready to pass to the model, either directly or via methods like `generate()`.
          """
          if kwargs:
              raise ValueError(
@@ -1659,6 +1240,83 @@ class MistralCommonBackend(PushToHubMixin):
          )
          return outputs

+     def build_inputs_with_special_tokens(self, token_ids_0: list[int], token_ids_1: None = None) -> list[int]:
+         """
+         Build model inputs from a sequence by adding special tokens.
+
+         This method dynamically builds inputs based on the tokenizer's `mode`:
+         - `"test"`: [BOS] seq0
+         - `"finetuning"`: [BOS] seq0 [EOS]
+
+         Args:
+             token_ids_0 (`list[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (`None`, *optional*): None, kept to match Transformers' signature.
+
+         Returns:
+             `list[int]`: List of input IDs with the appropriate special tokens.
+         """
+         if token_ids_1 is not None:
+             raise ValueError(
+                 "`MistralCommonBackend` does not implement `token_ids_1 != None` for `build_inputs_with_special_tokens`."
+             )
+
+         if self.mode == ValidationMode.test:
+             # [BOS] seq0
+             return [self.bos_token_id] + token_ids_0
+
+         else:
+             # [BOS] seq0 [EOS]
+             return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
+
+     def create_token_type_ids_from_sequences(self, token_ids_0: list[int], token_ids_1: None = None) -> list[int]:
+         """
+         Create a mask of zeros from the token ids with special tokens added.
+
+         Kept to match Transformers' implementation.
+
+         Args:
+             token_ids_0 (`list[int]`):
+                 List of IDs.
+             token_ids_1 (`None`, *optional*): None, kept to match Transformers' signature.
+
+         Returns:
+             `list[int]`: Token type IDs according to the configured pattern.
+         """
+         if token_ids_1 is not None:
+             raise ValueError(
+                 "`MistralCommonBackend` does not implement `token_ids_1 != None` for `create_token_type_ids_from_sequences`."
+             )
+
+         sequence = self.build_inputs_with_special_tokens(token_ids_0)
+
+         return [0] * len(sequence)
+
+     def num_special_tokens_to_add(self, pair: Literal[False] = False) -> int:
+         """
+         Returns the number of added tokens when encoding a sequence with special tokens.
+
+         <Tip>
+
+         This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
+         this inside your training loop.
+
+         </Tip>
+
+         Args:
+             pair (`Literal[False]`, *optional*): False, kept to match Transformers' signature.
+
+         Returns:
+             `int`: Number of special tokens added to sequences.
+         """
+         if pair:
+             raise ValueError(
+                 "`MistralCommonBackend` does not implement `pair = True` for `num_special_tokens_to_add`."
+             )
+
+         return len(self.build_inputs_with_special_tokens([], None))
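A minimal, dependency-free sketch of what the helpers added above compute; the `BOS`/`EOS` ids and the `build_inputs` name are made up for illustration and are not the package API.

```python
# Illustrative sketch of the mode-dependent special-token pattern (ids are made up).
BOS, EOS = 1, 2


def build_inputs(token_ids: list[int], mode: str) -> list[int]:
    # "test" mode: [BOS] seq0 -- "finetuning" mode: [BOS] seq0 [EOS]
    return [BOS] + token_ids if mode == "test" else [BOS] + token_ids + [EOS]


print(build_inputs([10, 11, 12], "test"))        # [1, 10, 11, 12]
print(build_inputs([10, 11, 12], "finetuning"))  # [1, 10, 11, 12, 2]
# num_special_tokens_to_add corresponds to building an empty sequence:
print(len(build_inputs([], "test")), len(build_inputs([], "finetuning")))  # 1 2
# create_token_type_ids_from_sequences returns zeros over the full built sequence:
print([0] * len(build_inputs([10, 11, 12], "test")))  # [0, 0, 0, 0]
```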
+
      @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
      def __call__(
          self,
@@ -1679,6 +1337,8 @@ class MistralCommonBackend(PushToHubMixin):
          return_special_tokens_mask: bool = False,
          return_length: bool = False,
          verbose: bool = True,
+         return_offsets_mapping: Literal[False] = False,
+         split_special_tokens: Literal[False] = False,
          **kwargs,
      ) -> BatchEncoding:
          """
@@ -1696,92 +1356,49 @@ class MistralCommonBackend(PushToHubMixin):
              text_pair_target (`None`, *optional*):
                  Not supported by `MistralCommonBackend`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
          """
-         if kwargs:
-             raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.__call__`.")
+         if return_offsets_mapping or split_special_tokens:
+             raise ValueError(
+                 "`MistralCommonBackend` does not support `return_offsets_mapping` and `split_special_tokens`."
+             )

-         if text_pair or text_target or text_pair_target:
+         if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND, "only_first", "only_second"]:
              raise ValueError(
-                 "`text_pair`, `text_target` and `text_pair_target` are not supported by `MistralCommonBackend`."
+                 "Truncation strategies `only_first` and `only_second` are not supported by `MistralCommonBackend`."
              )

-         def _is_valid_text_input(t):
-             if isinstance(t, str):
-                 # Strings are fine
-                 return True
-             elif isinstance(t, (list, tuple)):
-                 # List are fine as long as they are...
-                 if len(t) == 0:
-                     # ... empty
-                     return True
-                 elif isinstance(t[0], (str, int)):
-                     # ... list of strings or int
-                     return True
-                 elif isinstance(t[0], (list, tuple)):
-                     # ... list with an empty list or with a list of strings or with a list of ints
-                     return len(t[0]) == 0 or isinstance(t[0][0], (str, int))
-                 else:
-                     return False
-             else:
-                 return False
+         if kwargs:
+             raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.__call__`.")

-         if not _is_valid_text_input(text):
+         if text_pair or text_target or text_pair_target:
              raise ValueError(
-                 "text input must be of type `str` (single example), `list[str]` (batch or single encoded example) "
-                 "or `list[list[int]]` (batch of encoded examples)."
+                 "`text_pair`, `text_target` and `text_pair_target` are not supported by `MistralCommonBackend`."
              )

-         is_batched = isinstance(text, (list, tuple)) and isinstance(text[0], (str, list, tuple))
-
-         padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+         return super().__call__(
+             text=text,
+             text_pair=text_pair,
+             text_target=text_target,
+             add_special_tokens=add_special_tokens,
              padding=padding,
              truncation=truncation,
              max_length=max_length,
+             stride=stride,
              pad_to_multiple_of=pad_to_multiple_of,
+             padding_side=padding_side,
+             return_tensors=return_tensors,
+             return_attention_mask=return_attention_mask,
+             return_overflowing_tokens=return_overflowing_tokens,
+             return_special_tokens_mask=return_special_tokens_mask,
+             return_length=return_length,
              verbose=verbose,
-             **kwargs,
          )

-         if is_batched:
-             return self._batch_encode_plus(
-                 batch_text=text,
-                 add_special_tokens=add_special_tokens,
-                 padding_strategy=padding_strategy,
-                 truncation_strategy=truncation_strategy,
-                 max_length=max_length,
-                 stride=stride,
-                 pad_to_multiple_of=pad_to_multiple_of,
-                 padding_side=padding_side,
-                 return_tensors=return_tensors,
-                 return_attention_mask=return_attention_mask,
-                 return_overflowing_tokens=return_overflowing_tokens,
-                 return_special_tokens_mask=return_special_tokens_mask,
-                 return_length=return_length,
-                 verbose=verbose,
-             )
-         else:
-             return self._encode_plus(
-                 text=text,
-                 add_special_tokens=add_special_tokens,
-                 padding_strategy=padding_strategy,
-                 truncation_strategy=truncation_strategy,
-                 max_length=max_length,
-                 stride=stride,
-                 pad_to_multiple_of=pad_to_multiple_of,
-                 padding_side=padding_side,
-                 return_tensors=return_tensors,
-                 return_attention_mask=return_attention_mask,
-                 return_overflowing_tokens=return_overflowing_tokens,
-                 return_special_tokens_mask=return_special_tokens_mask,
-                 return_length=return_length,
-                 verbose=verbose,
-             )
-
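Since `__call__` now validates only the backend-specific restrictions and then delegates to `PreTrainedTokenizerBase.__call__`, the usual encode arguments flow through the shared base-class path. A hedged usage sketch follows; the module path and checkpoint location are assumptions, not taken from this diff.

```python
# Usage sketch (assumed import path and a placeholder checkpoint directory).
from transformers.tokenization_mistral_common import MistralCommonBackend

tok = MistralCommonBackend.from_pretrained("path/to/mistral-checkpoint")

# Standard arguments are now routed through PreTrainedTokenizerBase.__call__.
batch = tok(
    ["Hello world", "A somewhat longer second example"],
    padding=True,
    truncation=True,
    max_length=16,
    return_tensors="pt",
)
print(batch["input_ids"].shape)

# Unsupported options still fail fast:
# tok("Hello", return_offsets_mapping=True)  -> ValueError
# tok("Hello", truncation="only_first")      -> ValueError
```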
      @classmethod
      def from_pretrained(
          cls,
          pretrained_model_name_or_path: str | os.PathLike,
          *init_inputs,
-         mode: Union[str, ValidationMode] = ValidationMode.test,
+         mode: str | ValidationMode = ValidationMode.test,
          cache_dir: str | os.PathLike | None = None,
          force_download: bool = False,
          local_files_only: bool = False,
@@ -1808,9 +1425,9 @@ class MistralCommonBackend(PushToHubMixin):
                  `./my_model_directory/`.
              mode (`Union[str, ValidationMode]`, *optional*, defaults to `ValidationMode.test`):
                  Validation mode for the `MistralTokenizer` tokenizer. Possible values are:
-                 - `"finetuning"` or `ValidationMode.finetuning`: The finetuning mode.
+                 - `"finetuning"` or `ValidationMode.finetuning`: The fine-tuning mode.
                  - `"test"` or `ValidationMode.test`: The test mode.
-                 It changes how the tokenizer validates the input and prepare the request to the model.
+                 It changes how the tokenizer validates the input and prepares the request to the model.
              cache_dir (`str` or `os.PathLike`, *optional*):
                  Path to a directory in which downloaded predefined tokenizer vocabulary files should be cached if the
                  standard cache should not be used.
@@ -1837,11 +1454,11 @@ class MistralCommonBackend(PushToHubMixin):
                  Default value is picked from the class attribute of the same name.
              truncation_side (`str`, *optional*, defaults to `"right"`):
                  The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
-             model_input_names (`List[string]`, *optional*):
+             model_input_names (`List[str]`, *optional*):
                  The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
                  `"attention_mask"`). Default value is picked from the class attribute of the same name.
              clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
-                 Whether or not the model should cleanup the spaces that were added when splitting the input text during the
+                 Whether or not the model should clean up the spaces that were added when splitting the input text during the
                  tokenization process.
              kwargs (additional keyword arguments, *optional*):
                  Not supported by `MistralCommonBackend.from_pretrained`.
@@ -1851,11 +1468,13 @@ class MistralCommonBackend(PushToHubMixin):
              raise ValueError("`init_inputs` are not supported by `MistralCommonBackend.from_pretrained`.")

          # Handle kwargs and AutoTokenizer/AutoProcessor case
-         # These kwargs are passed by AutoTokenizer/AutoProcessor but are not used by MistralCommonBackend
-         if kwargs and not set(kwargs.keys()).issubset(
-             {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "_from_auto", "subfolder"}
-         ):
-             raise ValueError(f"Some kwargs in {kwargs} are not supported by `MistralCommonBackend.from_pretrained`.")
+         valid_kwargs = _VALID_INIT_KWARGS.union(
+             {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "subfolder"}
+         )
+         if kwargs and not set(kwargs.keys()).issubset(valid_kwargs):
+             raise ValueError(
+                 f"Some kwargs in {list(kwargs.keys())} are not supported by `MistralCommonBackend.from_pretrained`."
+             )

          mode = cls._get_validation_mode(mode)

@@ -1869,35 +1488,8 @@ class MistralCommonBackend(PushToHubMixin):
                  local_files_only=local_files_only,
              )
          else:
-             valid_tokenizer_files = []
-             tokenizer_file: str
-
-             instruct_versions = list(TokenizerVersion.__members__)
-             mm_versions = list(MultiModalVersion.__members__) + [""]  # allow no mm version
-             sentencepiece_suffixes = [f".model.{v}{m}" for v in instruct_versions for m in mm_versions] + [".model"]
-
-             for path in os.listdir(pretrained_model_name_or_path):
-                 pathlib_repo_file = Path(path)
-                 file_name = pathlib_repo_file.name
-                 suffix = "".join(pathlib_repo_file.suffixes)
-                 if file_name == "tekken.json" or suffix in sentencepiece_suffixes:
-                     valid_tokenizer_files.append(file_name)
-
-             if len(valid_tokenizer_files) == 0:
-                 raise ValueError(f"No tokenizer file found in directory: {pretrained_model_name_or_path}")
-             # If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
-             if len(valid_tokenizer_files) > 1:
-                 if "tekken.json" in valid_tokenizer_files:
-                     tokenizer_file = "tekken.json"
-                 else:
-                     tokenizer_file = max(valid_tokenizer_files)
-                 logger.warning(
-                     f"Multiple tokenizer files found in directory: {pretrained_model_name_or_path}. Using {tokenizer_file}."
-                 )
-             else:
-                 tokenizer_file = valid_tokenizer_files[0]
-
-             tokenizer_path = os.path.join(pretrained_model_name_or_path, tokenizer_file)
+             candidate_files = os.listdir(pretrained_model_name_or_path)
+             tokenizer_path = os.path.join(pretrained_model_name_or_path, get_one_valid_tokenizer_file(candidate_files))
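The inline file-selection logic removed above is now delegated to `get_one_valid_tokenizer_file`, which is defined elsewhere in the package and not shown in this hunk. Below is a rough sketch of the rule the removed block implemented (prefer `tekken.json`, otherwise take the highest-versioned sentencepiece file); the suffix check is deliberately simplified and not the real helper.

```python
# Simplified sketch of the removed selection rule; the real helper builds its suffix
# list from TokenizerVersion/MultiModalVersion instead of this naive check.
def pick_tokenizer_file(candidate_files: list[str]) -> str:
    valid = [f for f in candidate_files if f == "tekken.json" or f.endswith(".model") or ".model.v" in f]
    if not valid:
        raise ValueError("No tokenizer file found.")
    if "tekken.json" in valid:
        return "tekken.json"
    # Fall back to the lexicographically highest versioned file.
    return max(valid)


print(pick_tokenizer_file(["config.json", "tokenizer.model.v3", "tokenizer.model.v7m1"]))
# tokenizer.model.v7m1
```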

          return cls(
              tokenizer_path=tokenizer_path,
@@ -1909,7 +1501,7 @@ class MistralCommonBackend(PushToHubMixin):
              clean_up_tokenization_spaces=clean_up_tokenization_spaces,
          )

-     def save_pretrained(
+     def save_pretrained(  # type: ignore[override]
          self,
          save_directory: str | os.PathLike | Path,
          push_to_hub: bool = False,
@@ -1971,7 +1563,7 @@ class MistralCommonBackend(PushToHubMixin):
          return (str(save_directory / self._tokenizer_path.name),)

      @staticmethod
-     def _get_validation_mode(mode: Union[str, ValidationMode]) -> ValidationMode:
+     def _get_validation_mode(mode: str | ValidationMode) -> ValidationMode:
          """Get the validation mode from a string or a ValidationMode."""
          _invalid_mode_msg = (
              f"Invalid `mistral-common` tokenizer mode: {mode}. Possible values are 'finetuning' or 'test'."
@@ -1988,6 +1580,65 @@ class MistralCommonBackend(PushToHubMixin):
              raise ValueError(_invalid_mode_msg)
          return mode

+     def add_special_tokens(
+         self,
+         special_tokens_dict: dict[str, str | AddedToken | Sequence[str | AddedToken]],
+         replace_extra_special_tokens: bool = True,
+     ):
+         r"""`MistralCommonBackend` does not implement `add_special_tokens` by design.
+
+         If you would like this behaviour to be implemented, please open an issue in the `Transformers` or `mistral-common` repositories to request it.
+         """
+
+         raise NotImplementedError("`MistralCommonBackend` does not implement `add_special_tokens`.")
+
+     def add_tokens(  # type: ignore[override]
+         self,
+         special_tokens_dict: dict[str, str | AddedToken | Sequence[str | AddedToken]],
+         replace_extra_special_tokens: bool = True,
+     ):
+         """
+         `MistralCommonBackend` does not implement `add_tokens` by design.
+
+         If you would like this behaviour to be implemented, please open an issue in the `Transformers` or `mistral-common` repositories to request it.
+         """
+
+         raise NotImplementedError("`MistralCommonBackend` does not implement `add_tokens`.")
+
+     def convert_added_tokens(cls, obj: AddedToken | Any, save: bool = False, add_type_field: bool = True):  # type: ignore[override]
+         """
+         `MistralCommonBackend` does not implement `convert_added_tokens` by design.
+
+         If you would like this behaviour to be implemented, please open an issue in the `Transformers` or `mistral-common` repositories to request it.
+         """
+
+         raise NotImplementedError("`MistralCommonBackend` does not implement `convert_added_tokens`.")
+
+     def get_chat_template(self, chat_template: str | None = None, tools: list[dict] | None = None) -> str:
+         """`MistralCommonBackend` does not implement `get_chat_template` by design as `mistral-common` does not use chat templates."""
+
+         raise NotImplementedError("`MistralCommonBackend` does not implement `get_chat_template`.")
+
+     def save_chat_templates(
+         self,
+         save_directory: str | os.PathLike,
+         tokenizer_config: dict,
+         filename_prefix: str | None,
+         save_jinja_files: bool,
+     ):
+         """`MistralCommonBackend` does not implement `save_chat_templates` by design as `mistral-common` does not use chat templates."""
+
+         raise NotImplementedError("`MistralCommonBackend` does not implement `save_chat_templates`.")
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str, ...]:
+         """
+         `MistralCommonBackend` does not implement `save_vocabulary` by design.
+
+         This is because `mistral-common` is configured by a single tokenizer file. If you'd like to save the vocabulary, please consider using the `save_pretrained` method instead.
+         """
+
+         raise NotImplementedError("`MistralCommonBackend` does not implement `save_vocabulary`.")
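Because these overrides raise immediately instead of silently doing nothing, code migrating from a slow or fast tokenizer should handle the failure explicitly. Reusing the `tok` instance from the `__call__` sketch earlier (illustrative only):

```python
# Illustrative only; `tok` is the MistralCommonBackend instance from the earlier sketch.
try:
    tok.add_tokens(["<my_custom_token>"])
except NotImplementedError as err:
    # The vocabulary and special tokens are fixed by the single mistral-common tokenizer file.
    print(err)
```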
+

  # Backward compatibility alias for codebases still importing the legacy name.
  MistralCommonTokenizer = MistralCommonBackend