transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc3__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (1584):
  1. transformers/__init__.py +27 -27
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +32 -33
  4. transformers/cache_utils.py +32 -139
  5. transformers/cli/chat.py +3 -3
  6. transformers/cli/serve.py +2 -2
  7. transformers/cli/transformers.py +2 -1
  8. transformers/configuration_utils.py +143 -101
  9. transformers/conversion_mapping.py +73 -6
  10. transformers/convert_slow_tokenizer.py +3 -8
  11. transformers/core_model_loading.py +215 -50
  12. transformers/data/processors/glue.py +0 -1
  13. transformers/data/processors/utils.py +0 -1
  14. transformers/data/processors/xnli.py +0 -1
  15. transformers/dependency_versions_table.py +5 -5
  16. transformers/distributed/configuration_utils.py +1 -2
  17. transformers/dynamic_module_utils.py +23 -23
  18. transformers/feature_extraction_sequence_utils.py +19 -23
  19. transformers/feature_extraction_utils.py +63 -31
  20. transformers/generation/candidate_generator.py +80 -33
  21. transformers/generation/configuration_utils.py +186 -131
  22. transformers/generation/continuous_batching/__init__.py +0 -1
  23. transformers/generation/continuous_batching/cache.py +81 -24
  24. transformers/generation/continuous_batching/cache_manager.py +155 -45
  25. transformers/generation/continuous_batching/continuous_api.py +152 -84
  26. transformers/generation/continuous_batching/requests.py +51 -3
  27. transformers/generation/continuous_batching/scheduler.py +127 -52
  28. transformers/generation/logits_process.py +0 -128
  29. transformers/generation/stopping_criteria.py +1 -1
  30. transformers/generation/streamers.py +0 -1
  31. transformers/generation/utils.py +107 -119
  32. transformers/generation/watermarking.py +8 -6
  33. transformers/hf_argparser.py +9 -13
  34. transformers/hyperparameter_search.py +1 -2
  35. transformers/image_processing_base.py +11 -21
  36. transformers/image_processing_utils.py +11 -12
  37. transformers/image_processing_utils_fast.py +68 -57
  38. transformers/image_transforms.py +29 -29
  39. transformers/image_utils.py +30 -32
  40. transformers/initialization.py +37 -0
  41. transformers/integrations/__init__.py +12 -0
  42. transformers/integrations/accelerate.py +44 -111
  43. transformers/integrations/aqlm.py +3 -5
  44. transformers/integrations/awq.py +3 -8
  45. transformers/integrations/bitnet.py +5 -8
  46. transformers/integrations/bitsandbytes.py +16 -15
  47. transformers/integrations/deepspeed.py +19 -4
  48. transformers/integrations/eetq.py +3 -6
  49. transformers/integrations/fbgemm_fp8.py +2 -3
  50. transformers/integrations/finegrained_fp8.py +14 -23
  51. transformers/integrations/flash_attention.py +2 -2
  52. transformers/integrations/flex_attention.py +1 -1
  53. transformers/integrations/fp_quant.py +4 -6
  54. transformers/integrations/ggml.py +0 -1
  55. transformers/integrations/higgs.py +2 -5
  56. transformers/integrations/hub_kernels.py +23 -5
  57. transformers/integrations/integration_utils.py +37 -3
  58. transformers/integrations/mistral.py +12 -0
  59. transformers/integrations/moe.py +240 -0
  60. transformers/integrations/mxfp4.py +9 -16
  61. transformers/integrations/peft.py +5 -0
  62. transformers/integrations/quanto.py +5 -2
  63. transformers/integrations/quark.py +2 -4
  64. transformers/integrations/spqr.py +3 -5
  65. transformers/integrations/tensor_parallel.py +167 -221
  66. transformers/integrations/torchao.py +4 -6
  67. transformers/integrations/vptq.py +3 -5
  68. transformers/loss/loss_lw_detr.py +356 -0
  69. transformers/loss/loss_utils.py +2 -0
  70. transformers/masking_utils.py +47 -51
  71. transformers/model_debugging_utils.py +4 -5
  72. transformers/modelcard.py +14 -192
  73. transformers/modeling_attn_mask_utils.py +19 -19
  74. transformers/modeling_flash_attention_utils.py +27 -27
  75. transformers/modeling_gguf_pytorch_utils.py +71 -24
  76. transformers/modeling_layers.py +21 -22
  77. transformers/modeling_outputs.py +242 -253
  78. transformers/modeling_rope_utils.py +110 -113
  79. transformers/modeling_utils.py +633 -576
  80. transformers/models/__init__.py +23 -0
  81. transformers/models/afmoe/configuration_afmoe.py +26 -29
  82. transformers/models/afmoe/modeling_afmoe.py +37 -49
  83. transformers/models/afmoe/modular_afmoe.py +21 -31
  84. transformers/models/aimv2/configuration_aimv2.py +2 -5
  85. transformers/models/aimv2/modeling_aimv2.py +24 -21
  86. transformers/models/aimv2/modular_aimv2.py +11 -9
  87. transformers/models/albert/configuration_albert.py +0 -1
  88. transformers/models/albert/modeling_albert.py +70 -69
  89. transformers/models/albert/tokenization_albert.py +1 -4
  90. transformers/models/align/configuration_align.py +0 -1
  91. transformers/models/align/modeling_align.py +73 -68
  92. transformers/models/align/processing_align.py +2 -30
  93. transformers/models/altclip/configuration_altclip.py +0 -1
  94. transformers/models/altclip/modeling_altclip.py +83 -80
  95. transformers/models/altclip/processing_altclip.py +2 -15
  96. transformers/models/apertus/__init__.py +0 -1
  97. transformers/models/apertus/configuration_apertus.py +18 -21
  98. transformers/models/apertus/modeling_apertus.py +35 -36
  99. transformers/models/apertus/modular_apertus.py +32 -31
  100. transformers/models/arcee/configuration_arcee.py +20 -23
  101. transformers/models/arcee/modeling_arcee.py +32 -35
  102. transformers/models/arcee/modular_arcee.py +20 -23
  103. transformers/models/aria/configuration_aria.py +20 -23
  104. transformers/models/aria/image_processing_aria.py +25 -27
  105. transformers/models/aria/modeling_aria.py +71 -70
  106. transformers/models/aria/modular_aria.py +85 -88
  107. transformers/models/aria/processing_aria.py +28 -35
  108. transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +0 -1
  109. transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +3 -6
  110. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +6 -8
  111. transformers/models/audioflamingo3/__init__.py +0 -1
  112. transformers/models/audioflamingo3/configuration_audioflamingo3.py +0 -1
  113. transformers/models/audioflamingo3/modeling_audioflamingo3.py +22 -23
  114. transformers/models/audioflamingo3/modular_audioflamingo3.py +12 -17
  115. transformers/models/audioflamingo3/processing_audioflamingo3.py +33 -30
  116. transformers/models/auto/auto_factory.py +5 -6
  117. transformers/models/auto/configuration_auto.py +53 -5
  118. transformers/models/auto/feature_extraction_auto.py +12 -10
  119. transformers/models/auto/image_processing_auto.py +17 -28
  120. transformers/models/auto/modeling_auto.py +38 -188
  121. transformers/models/auto/processing_auto.py +6 -1
  122. transformers/models/auto/tokenization_auto.py +147 -169
  123. transformers/models/auto/video_processing_auto.py +12 -10
  124. transformers/models/autoformer/configuration_autoformer.py +4 -7
  125. transformers/models/autoformer/modeling_autoformer.py +98 -100
  126. transformers/models/aya_vision/configuration_aya_vision.py +0 -1
  127. transformers/models/aya_vision/modeling_aya_vision.py +42 -40
  128. transformers/models/aya_vision/modular_aya_vision.py +26 -29
  129. transformers/models/aya_vision/processing_aya_vision.py +25 -53
  130. transformers/models/bamba/configuration_bamba.py +29 -32
  131. transformers/models/bamba/modeling_bamba.py +78 -83
  132. transformers/models/bamba/modular_bamba.py +68 -71
  133. transformers/models/bark/configuration_bark.py +4 -7
  134. transformers/models/bark/generation_configuration_bark.py +3 -5
  135. transformers/models/bark/modeling_bark.py +49 -55
  136. transformers/models/bark/processing_bark.py +19 -41
  137. transformers/models/bart/configuration_bart.py +0 -2
  138. transformers/models/bart/modeling_bart.py +122 -117
  139. transformers/models/barthez/tokenization_barthez.py +1 -4
  140. transformers/models/bartpho/tokenization_bartpho.py +6 -7
  141. transformers/models/beit/configuration_beit.py +0 -11
  142. transformers/models/beit/image_processing_beit.py +53 -56
  143. transformers/models/beit/image_processing_beit_fast.py +8 -10
  144. transformers/models/beit/modeling_beit.py +51 -53
  145. transformers/models/bert/configuration_bert.py +0 -1
  146. transformers/models/bert/modeling_bert.py +114 -122
  147. transformers/models/bert/tokenization_bert.py +2 -4
  148. transformers/models/bert/tokenization_bert_legacy.py +3 -5
  149. transformers/models/bert_generation/configuration_bert_generation.py +0 -1
  150. transformers/models/bert_generation/modeling_bert_generation.py +49 -49
  151. transformers/models/bert_generation/tokenization_bert_generation.py +2 -3
  152. transformers/models/bert_japanese/tokenization_bert_japanese.py +5 -6
  153. transformers/models/bertweet/tokenization_bertweet.py +1 -3
  154. transformers/models/big_bird/configuration_big_bird.py +0 -1
  155. transformers/models/big_bird/modeling_big_bird.py +110 -109
  156. transformers/models/big_bird/tokenization_big_bird.py +1 -4
  157. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +0 -1
  158. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +116 -111
  159. transformers/models/biogpt/configuration_biogpt.py +0 -1
  160. transformers/models/biogpt/modeling_biogpt.py +69 -71
  161. transformers/models/biogpt/modular_biogpt.py +59 -61
  162. transformers/models/biogpt/tokenization_biogpt.py +3 -5
  163. transformers/models/bit/configuration_bit.py +0 -1
  164. transformers/models/bit/image_processing_bit.py +21 -24
  165. transformers/models/bit/image_processing_bit_fast.py +0 -1
  166. transformers/models/bit/modeling_bit.py +14 -12
  167. transformers/models/bitnet/configuration_bitnet.py +18 -21
  168. transformers/models/bitnet/modeling_bitnet.py +32 -35
  169. transformers/models/bitnet/modular_bitnet.py +4 -6
  170. transformers/models/blenderbot/configuration_blenderbot.py +0 -1
  171. transformers/models/blenderbot/modeling_blenderbot.py +71 -95
  172. transformers/models/blenderbot/tokenization_blenderbot.py +6 -8
  173. transformers/models/blenderbot_small/configuration_blenderbot_small.py +0 -1
  174. transformers/models/blenderbot_small/modeling_blenderbot_small.py +73 -68
  175. transformers/models/blenderbot_small/tokenization_blenderbot_small.py +1 -3
  176. transformers/models/blip/configuration_blip.py +0 -1
  177. transformers/models/blip/image_processing_blip.py +17 -20
  178. transformers/models/blip/image_processing_blip_fast.py +0 -1
  179. transformers/models/blip/modeling_blip.py +62 -71
  180. transformers/models/blip/modeling_blip_text.py +71 -65
  181. transformers/models/blip/processing_blip.py +5 -36
  182. transformers/models/blip_2/configuration_blip_2.py +0 -1
  183. transformers/models/blip_2/modeling_blip_2.py +72 -71
  184. transformers/models/blip_2/processing_blip_2.py +8 -38
  185. transformers/models/bloom/configuration_bloom.py +0 -1
  186. transformers/models/bloom/modeling_bloom.py +71 -103
  187. transformers/models/blt/configuration_blt.py +71 -74
  188. transformers/models/blt/modeling_blt.py +235 -78
  189. transformers/models/blt/modular_blt.py +225 -62
  190. transformers/models/bridgetower/configuration_bridgetower.py +0 -1
  191. transformers/models/bridgetower/image_processing_bridgetower.py +34 -35
  192. transformers/models/bridgetower/image_processing_bridgetower_fast.py +7 -10
  193. transformers/models/bridgetower/modeling_bridgetower.py +113 -109
  194. transformers/models/bridgetower/processing_bridgetower.py +2 -16
  195. transformers/models/bros/configuration_bros.py +0 -1
  196. transformers/models/bros/modeling_bros.py +86 -80
  197. transformers/models/bros/processing_bros.py +2 -12
  198. transformers/models/byt5/tokenization_byt5.py +4 -6
  199. transformers/models/camembert/configuration_camembert.py +0 -1
  200. transformers/models/camembert/modeling_camembert.py +196 -195
  201. transformers/models/camembert/modular_camembert.py +51 -54
  202. transformers/models/camembert/tokenization_camembert.py +1 -4
  203. transformers/models/canine/configuration_canine.py +0 -1
  204. transformers/models/canine/modeling_canine.py +79 -75
  205. transformers/models/canine/tokenization_canine.py +2 -1
  206. transformers/models/chameleon/configuration_chameleon.py +24 -27
  207. transformers/models/chameleon/image_processing_chameleon.py +21 -24
  208. transformers/models/chameleon/image_processing_chameleon_fast.py +0 -1
  209. transformers/models/chameleon/modeling_chameleon.py +62 -60
  210. transformers/models/chameleon/processing_chameleon.py +16 -41
  211. transformers/models/chinese_clip/configuration_chinese_clip.py +0 -1
  212. transformers/models/chinese_clip/image_processing_chinese_clip.py +21 -24
  213. transformers/models/chinese_clip/image_processing_chinese_clip_fast.py +0 -1
  214. transformers/models/chinese_clip/modeling_chinese_clip.py +71 -69
  215. transformers/models/chinese_clip/processing_chinese_clip.py +2 -15
  216. transformers/models/clap/configuration_clap.py +0 -1
  217. transformers/models/clap/feature_extraction_clap.py +11 -12
  218. transformers/models/clap/modeling_clap.py +113 -104
  219. transformers/models/clap/processing_clap.py +2 -15
  220. transformers/models/clip/configuration_clip.py +0 -1
  221. transformers/models/clip/image_processing_clip.py +21 -24
  222. transformers/models/clip/image_processing_clip_fast.py +0 -1
  223. transformers/models/clip/modeling_clip.py +47 -46
  224. transformers/models/clip/processing_clip.py +2 -14
  225. transformers/models/clip/tokenization_clip.py +2 -5
  226. transformers/models/clipseg/configuration_clipseg.py +0 -1
  227. transformers/models/clipseg/modeling_clipseg.py +90 -87
  228. transformers/models/clipseg/processing_clipseg.py +8 -39
  229. transformers/models/clvp/configuration_clvp.py +1 -3
  230. transformers/models/clvp/feature_extraction_clvp.py +7 -10
  231. transformers/models/clvp/modeling_clvp.py +133 -118
  232. transformers/models/clvp/number_normalizer.py +1 -2
  233. transformers/models/clvp/processing_clvp.py +3 -20
  234. transformers/models/clvp/tokenization_clvp.py +0 -1
  235. transformers/models/code_llama/tokenization_code_llama.py +4 -7
  236. transformers/models/codegen/configuration_codegen.py +0 -1
  237. transformers/models/codegen/modeling_codegen.py +61 -52
  238. transformers/models/codegen/tokenization_codegen.py +5 -6
  239. transformers/models/cohere/configuration_cohere.py +20 -23
  240. transformers/models/cohere/modeling_cohere.py +36 -39
  241. transformers/models/cohere/modular_cohere.py +24 -28
  242. transformers/models/cohere/tokenization_cohere.py +5 -6
  243. transformers/models/cohere2/configuration_cohere2.py +21 -24
  244. transformers/models/cohere2/modeling_cohere2.py +35 -38
  245. transformers/models/cohere2/modular_cohere2.py +39 -41
  246. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +6 -8
  247. transformers/models/cohere2_vision/modeling_cohere2_vision.py +35 -33
  248. transformers/models/cohere2_vision/modular_cohere2_vision.py +21 -23
  249. transformers/models/cohere2_vision/processing_cohere2_vision.py +6 -36
  250. transformers/models/colpali/configuration_colpali.py +0 -1
  251. transformers/models/colpali/modeling_colpali.py +14 -16
  252. transformers/models/colpali/modular_colpali.py +11 -51
  253. transformers/models/colpali/processing_colpali.py +14 -52
  254. transformers/models/colqwen2/modeling_colqwen2.py +20 -22
  255. transformers/models/colqwen2/modular_colqwen2.py +29 -68
  256. transformers/models/colqwen2/processing_colqwen2.py +16 -52
  257. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -2
  258. transformers/models/conditional_detr/image_processing_conditional_detr.py +64 -66
  259. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +22 -22
  260. transformers/models/conditional_detr/modeling_conditional_detr.py +82 -81
  261. transformers/models/conditional_detr/modular_conditional_detr.py +1 -3
  262. transformers/models/convbert/configuration_convbert.py +0 -1
  263. transformers/models/convbert/modeling_convbert.py +88 -87
  264. transformers/models/convbert/tokenization_convbert.py +0 -1
  265. transformers/models/convnext/configuration_convnext.py +0 -1
  266. transformers/models/convnext/image_processing_convnext.py +20 -23
  267. transformers/models/convnext/image_processing_convnext_fast.py +14 -19
  268. transformers/models/convnext/modeling_convnext.py +5 -8
  269. transformers/models/convnextv2/configuration_convnextv2.py +0 -1
  270. transformers/models/convnextv2/modeling_convnextv2.py +5 -8
  271. transformers/models/cpm/tokenization_cpm.py +6 -7
  272. transformers/models/cpm/tokenization_cpm_fast.py +3 -5
  273. transformers/models/cpmant/configuration_cpmant.py +0 -1
  274. transformers/models/cpmant/modeling_cpmant.py +38 -40
  275. transformers/models/cpmant/tokenization_cpmant.py +1 -3
  276. transformers/models/csm/configuration_csm.py +49 -51
  277. transformers/models/csm/generation_csm.py +31 -35
  278. transformers/models/csm/modeling_csm.py +81 -82
  279. transformers/models/csm/modular_csm.py +58 -58
  280. transformers/models/csm/processing_csm.py +25 -68
  281. transformers/models/ctrl/configuration_ctrl.py +0 -1
  282. transformers/models/ctrl/modeling_ctrl.py +52 -43
  283. transformers/models/ctrl/tokenization_ctrl.py +0 -1
  284. transformers/models/cvt/configuration_cvt.py +0 -1
  285. transformers/models/cvt/modeling_cvt.py +18 -16
  286. transformers/models/cwm/__init__.py +0 -1
  287. transformers/models/cwm/configuration_cwm.py +3 -5
  288. transformers/models/cwm/modeling_cwm.py +33 -35
  289. transformers/models/cwm/modular_cwm.py +10 -12
  290. transformers/models/d_fine/configuration_d_fine.py +3 -5
  291. transformers/models/d_fine/modeling_d_fine.py +127 -121
  292. transformers/models/d_fine/modular_d_fine.py +23 -13
  293. transformers/models/dab_detr/configuration_dab_detr.py +2 -3
  294. transformers/models/dab_detr/modeling_dab_detr.py +69 -71
  295. transformers/models/dac/configuration_dac.py +0 -1
  296. transformers/models/dac/feature_extraction_dac.py +6 -9
  297. transformers/models/dac/modeling_dac.py +21 -23
  298. transformers/models/data2vec/configuration_data2vec_audio.py +0 -1
  299. transformers/models/data2vec/configuration_data2vec_text.py +0 -1
  300. transformers/models/data2vec/configuration_data2vec_vision.py +0 -1
  301. transformers/models/data2vec/modeling_data2vec_audio.py +52 -56
  302. transformers/models/data2vec/modeling_data2vec_text.py +98 -93
  303. transformers/models/data2vec/modeling_data2vec_vision.py +41 -42
  304. transformers/models/data2vec/modular_data2vec_audio.py +6 -1
  305. transformers/models/data2vec/modular_data2vec_text.py +58 -54
  306. transformers/models/dbrx/configuration_dbrx.py +27 -20
  307. transformers/models/dbrx/modeling_dbrx.py +40 -43
  308. transformers/models/dbrx/modular_dbrx.py +31 -33
  309. transformers/models/deberta/configuration_deberta.py +0 -1
  310. transformers/models/deberta/modeling_deberta.py +59 -60
  311. transformers/models/deberta/tokenization_deberta.py +2 -5
  312. transformers/models/deberta_v2/configuration_deberta_v2.py +0 -1
  313. transformers/models/deberta_v2/modeling_deberta_v2.py +65 -65
  314. transformers/models/deberta_v2/tokenization_deberta_v2.py +1 -4
  315. transformers/models/decision_transformer/configuration_decision_transformer.py +0 -1
  316. transformers/models/decision_transformer/modeling_decision_transformer.py +56 -55
  317. transformers/models/deepseek_v2/configuration_deepseek_v2.py +34 -37
  318. transformers/models/deepseek_v2/modeling_deepseek_v2.py +39 -37
  319. transformers/models/deepseek_v2/modular_deepseek_v2.py +44 -44
  320. transformers/models/deepseek_v3/configuration_deepseek_v3.py +35 -38
  321. transformers/models/deepseek_v3/modeling_deepseek_v3.py +40 -38
  322. transformers/models/deepseek_v3/modular_deepseek_v3.py +10 -7
  323. transformers/models/deepseek_vl/configuration_deepseek_vl.py +2 -3
  324. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +25 -26
  325. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +7 -7
  326. transformers/models/deepseek_vl/modeling_deepseek_vl.py +40 -36
  327. transformers/models/deepseek_vl/modular_deepseek_vl.py +14 -43
  328. transformers/models/deepseek_vl/processing_deepseek_vl.py +10 -41
  329. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +3 -5
  330. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +35 -35
  331. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +16 -20
  332. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +42 -38
  333. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +80 -99
  334. transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +12 -44
  335. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -3
  336. transformers/models/deformable_detr/image_processing_deformable_detr.py +59 -61
  337. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +17 -17
  338. transformers/models/deformable_detr/modeling_deformable_detr.py +67 -68
  339. transformers/models/deformable_detr/modular_deformable_detr.py +1 -3
  340. transformers/models/deit/configuration_deit.py +0 -1
  341. transformers/models/deit/image_processing_deit.py +18 -21
  342. transformers/models/deit/image_processing_deit_fast.py +0 -1
  343. transformers/models/deit/modeling_deit.py +16 -18
  344. transformers/models/depth_anything/configuration_depth_anything.py +2 -4
  345. transformers/models/depth_anything/modeling_depth_anything.py +5 -8
  346. transformers/models/depth_pro/configuration_depth_pro.py +0 -1
  347. transformers/models/depth_pro/image_processing_depth_pro.py +22 -23
  348. transformers/models/depth_pro/image_processing_depth_pro_fast.py +6 -8
  349. transformers/models/depth_pro/modeling_depth_pro.py +21 -23
  350. transformers/models/detr/configuration_detr.py +1 -2
  351. transformers/models/detr/image_processing_detr.py +64 -66
  352. transformers/models/detr/image_processing_detr_fast.py +22 -23
  353. transformers/models/detr/modeling_detr.py +78 -73
  354. transformers/models/dia/configuration_dia.py +5 -8
  355. transformers/models/dia/feature_extraction_dia.py +6 -9
  356. transformers/models/dia/generation_dia.py +42 -45
  357. transformers/models/dia/modeling_dia.py +73 -65
  358. transformers/models/dia/modular_dia.py +63 -54
  359. transformers/models/dia/processing_dia.py +39 -29
  360. transformers/models/dia/tokenization_dia.py +3 -6
  361. transformers/models/diffllama/configuration_diffllama.py +20 -23
  362. transformers/models/diffllama/modeling_diffllama.py +44 -47
  363. transformers/models/diffllama/modular_diffllama.py +17 -19
  364. transformers/models/dinat/configuration_dinat.py +0 -1
  365. transformers/models/dinat/modeling_dinat.py +40 -42
  366. transformers/models/dinov2/configuration_dinov2.py +0 -1
  367. transformers/models/dinov2/modeling_dinov2.py +11 -13
  368. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +1 -1
  369. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +12 -13
  370. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +5 -7
  371. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +4 -7
  372. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +3 -6
  373. transformers/models/dinov3_vit/configuration_dinov3_vit.py +5 -8
  374. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +5 -7
  375. transformers/models/dinov3_vit/modeling_dinov3_vit.py +17 -16
  376. transformers/models/dinov3_vit/modular_dinov3_vit.py +14 -13
  377. transformers/models/distilbert/configuration_distilbert.py +0 -1
  378. transformers/models/distilbert/modeling_distilbert.py +55 -55
  379. transformers/models/distilbert/tokenization_distilbert.py +0 -1
  380. transformers/models/doge/__init__.py +0 -1
  381. transformers/models/doge/configuration_doge.py +25 -28
  382. transformers/models/doge/modeling_doge.py +43 -46
  383. transformers/models/doge/modular_doge.py +57 -58
  384. transformers/models/donut/configuration_donut_swin.py +0 -1
  385. transformers/models/donut/image_processing_donut.py +26 -29
  386. transformers/models/donut/image_processing_donut_fast.py +5 -11
  387. transformers/models/donut/modeling_donut_swin.py +60 -58
  388. transformers/models/donut/processing_donut.py +5 -26
  389. transformers/models/dots1/configuration_dots1.py +27 -29
  390. transformers/models/dots1/modeling_dots1.py +45 -39
  391. transformers/models/dots1/modular_dots1.py +0 -1
  392. transformers/models/dpr/configuration_dpr.py +0 -1
  393. transformers/models/dpr/modeling_dpr.py +37 -39
  394. transformers/models/dpr/tokenization_dpr.py +7 -9
  395. transformers/models/dpr/tokenization_dpr_fast.py +7 -9
  396. transformers/models/dpt/configuration_dpt.py +1 -2
  397. transformers/models/dpt/image_processing_dpt.py +65 -66
  398. transformers/models/dpt/image_processing_dpt_fast.py +14 -16
  399. transformers/models/dpt/modeling_dpt.py +19 -21
  400. transformers/models/dpt/modular_dpt.py +11 -13
  401. transformers/models/edgetam/configuration_edgetam.py +1 -2
  402. transformers/models/edgetam/modeling_edgetam.py +44 -43
  403. transformers/models/edgetam/modular_edgetam.py +17 -20
  404. transformers/models/edgetam_video/__init__.py +0 -1
  405. transformers/models/edgetam_video/configuration_edgetam_video.py +0 -1
  406. transformers/models/edgetam_video/modeling_edgetam_video.py +131 -120
  407. transformers/models/edgetam_video/modular_edgetam_video.py +29 -37
  408. transformers/models/efficientloftr/configuration_efficientloftr.py +4 -5
  409. transformers/models/efficientloftr/image_processing_efficientloftr.py +14 -16
  410. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +5 -6
  411. transformers/models/efficientloftr/modeling_efficientloftr.py +41 -30
  412. transformers/models/efficientloftr/modular_efficientloftr.py +1 -3
  413. transformers/models/efficientnet/configuration_efficientnet.py +0 -1
  414. transformers/models/efficientnet/image_processing_efficientnet.py +28 -32
  415. transformers/models/efficientnet/image_processing_efficientnet_fast.py +15 -17
  416. transformers/models/efficientnet/modeling_efficientnet.py +17 -15
  417. transformers/models/electra/configuration_electra.py +0 -1
  418. transformers/models/electra/modeling_electra.py +108 -103
  419. transformers/models/emu3/configuration_emu3.py +5 -7
  420. transformers/models/emu3/image_processing_emu3.py +44 -39
  421. transformers/models/emu3/modeling_emu3.py +67 -64
  422. transformers/models/emu3/modular_emu3.py +39 -35
  423. transformers/models/emu3/processing_emu3.py +18 -43
  424. transformers/models/encodec/configuration_encodec.py +2 -4
  425. transformers/models/encodec/feature_extraction_encodec.py +10 -13
  426. transformers/models/encodec/modeling_encodec.py +39 -29
  427. transformers/models/encoder_decoder/configuration_encoder_decoder.py +0 -1
  428. transformers/models/encoder_decoder/modeling_encoder_decoder.py +17 -19
  429. transformers/models/eomt/configuration_eomt.py +0 -1
  430. transformers/models/eomt/image_processing_eomt.py +53 -55
  431. transformers/models/eomt/image_processing_eomt_fast.py +59 -28
  432. transformers/models/eomt/modeling_eomt.py +23 -18
  433. transformers/models/eomt/modular_eomt.py +18 -13
  434. transformers/models/ernie/configuration_ernie.py +0 -1
  435. transformers/models/ernie/modeling_ernie.py +127 -132
  436. transformers/models/ernie/modular_ernie.py +97 -103
  437. transformers/models/ernie4_5/configuration_ernie4_5.py +18 -20
  438. transformers/models/ernie4_5/modeling_ernie4_5.py +32 -34
  439. transformers/models/ernie4_5/modular_ernie4_5.py +1 -3
  440. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +27 -29
  441. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +52 -51
  442. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +16 -44
  443. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  444. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +329 -0
  445. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +455 -0
  446. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +231 -0
  447. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1895 -0
  448. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1901 -0
  449. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +249 -0
  450. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +593 -0
  451. transformers/models/esm/configuration_esm.py +2 -4
  452. transformers/models/esm/modeling_esm.py +38 -34
  453. transformers/models/esm/modeling_esmfold.py +48 -45
  454. transformers/models/esm/openfold_utils/chunk_utils.py +6 -6
  455. transformers/models/esm/openfold_utils/loss.py +1 -2
  456. transformers/models/esm/openfold_utils/protein.py +13 -13
  457. transformers/models/esm/openfold_utils/tensor_utils.py +6 -6
  458. transformers/models/esm/tokenization_esm.py +2 -4
  459. transformers/models/evolla/configuration_evolla.py +29 -32
  460. transformers/models/evolla/modeling_evolla.py +67 -62
  461. transformers/models/evolla/modular_evolla.py +53 -47
  462. transformers/models/evolla/processing_evolla.py +23 -35
  463. transformers/models/exaone4/configuration_exaone4.py +19 -22
  464. transformers/models/exaone4/modeling_exaone4.py +33 -36
  465. transformers/models/exaone4/modular_exaone4.py +40 -42
  466. transformers/models/falcon/configuration_falcon.py +22 -25
  467. transformers/models/falcon/modeling_falcon.py +75 -78
  468. transformers/models/falcon_h1/configuration_falcon_h1.py +40 -43
  469. transformers/models/falcon_h1/modeling_falcon_h1.py +80 -78
  470. transformers/models/falcon_h1/modular_falcon_h1.py +54 -50
  471. transformers/models/falcon_mamba/configuration_falcon_mamba.py +0 -1
  472. transformers/models/falcon_mamba/modeling_falcon_mamba.py +50 -47
  473. transformers/models/falcon_mamba/modular_falcon_mamba.py +16 -14
  474. transformers/models/fast_vlm/configuration_fast_vlm.py +1 -0
  475. transformers/models/fast_vlm/modeling_fast_vlm.py +43 -39
  476. transformers/models/fast_vlm/modular_fast_vlm.py +2 -3
  477. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +2 -5
  478. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +68 -57
  479. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +2 -3
  480. transformers/models/flaubert/configuration_flaubert.py +0 -1
  481. transformers/models/flaubert/modeling_flaubert.py +138 -143
  482. transformers/models/flaubert/tokenization_flaubert.py +3 -5
  483. transformers/models/flava/configuration_flava.py +5 -6
  484. transformers/models/flava/image_processing_flava.py +66 -67
  485. transformers/models/flava/image_processing_flava_fast.py +42 -45
  486. transformers/models/flava/modeling_flava.py +111 -107
  487. transformers/models/flava/processing_flava.py +2 -12
  488. transformers/models/flex_olmo/__init__.py +0 -1
  489. transformers/models/flex_olmo/configuration_flex_olmo.py +23 -25
  490. transformers/models/flex_olmo/modeling_flex_olmo.py +44 -43
  491. transformers/models/flex_olmo/modular_flex_olmo.py +35 -37
  492. transformers/models/florence2/configuration_florence2.py +0 -1
  493. transformers/models/florence2/modeling_florence2.py +59 -43
  494. transformers/models/florence2/modular_florence2.py +65 -81
  495. transformers/models/florence2/processing_florence2.py +18 -47
  496. transformers/models/fnet/configuration_fnet.py +0 -1
  497. transformers/models/fnet/modeling_fnet.py +76 -80
  498. transformers/models/fnet/tokenization_fnet.py +0 -1
  499. transformers/models/focalnet/configuration_focalnet.py +0 -1
  500. transformers/models/focalnet/modeling_focalnet.py +39 -41
  501. transformers/models/fsmt/configuration_fsmt.py +0 -1
  502. transformers/models/fsmt/modeling_fsmt.py +47 -48
  503. transformers/models/fsmt/tokenization_fsmt.py +3 -5
  504. transformers/models/funnel/configuration_funnel.py +0 -1
  505. transformers/models/funnel/modeling_funnel.py +91 -93
  506. transformers/models/funnel/tokenization_funnel.py +2 -5
  507. transformers/models/fuyu/configuration_fuyu.py +23 -26
  508. transformers/models/fuyu/image_processing_fuyu.py +29 -31
  509. transformers/models/fuyu/image_processing_fuyu_fast.py +12 -13
  510. transformers/models/fuyu/modeling_fuyu.py +29 -30
  511. transformers/models/fuyu/processing_fuyu.py +23 -34
  512. transformers/models/gemma/configuration_gemma.py +20 -23
  513. transformers/models/gemma/modeling_gemma.py +42 -46
  514. transformers/models/gemma/modular_gemma.py +37 -40
  515. transformers/models/gemma/tokenization_gemma.py +3 -6
  516. transformers/models/gemma2/configuration_gemma2.py +25 -28
  517. transformers/models/gemma2/modeling_gemma2.py +35 -38
  518. transformers/models/gemma2/modular_gemma2.py +56 -58
  519. transformers/models/gemma3/configuration_gemma3.py +28 -29
  520. transformers/models/gemma3/image_processing_gemma3.py +29 -31
  521. transformers/models/gemma3/image_processing_gemma3_fast.py +9 -11
  522. transformers/models/gemma3/modeling_gemma3.py +112 -94
  523. transformers/models/gemma3/modular_gemma3.py +110 -91
  524. transformers/models/gemma3/processing_gemma3.py +5 -5
  525. transformers/models/gemma3n/configuration_gemma3n.py +12 -10
  526. transformers/models/gemma3n/feature_extraction_gemma3n.py +9 -11
  527. transformers/models/gemma3n/modeling_gemma3n.py +127 -98
  528. transformers/models/gemma3n/modular_gemma3n.py +117 -84
  529. transformers/models/gemma3n/processing_gemma3n.py +12 -26
  530. transformers/models/git/configuration_git.py +0 -1
  531. transformers/models/git/modeling_git.py +250 -197
  532. transformers/models/git/processing_git.py +2 -14
  533. transformers/models/glm/configuration_glm.py +19 -21
  534. transformers/models/glm/modeling_glm.py +33 -36
  535. transformers/models/glm/modular_glm.py +4 -7
  536. transformers/models/glm4/configuration_glm4.py +19 -21
  537. transformers/models/glm4/modeling_glm4.py +36 -38
  538. transformers/models/glm4/modular_glm4.py +8 -10
  539. transformers/models/glm46v/configuration_glm46v.py +0 -1
  540. transformers/models/glm46v/image_processing_glm46v.py +35 -40
  541. transformers/models/glm46v/image_processing_glm46v_fast.py +7 -7
  542. transformers/models/glm46v/modeling_glm46v.py +54 -52
  543. transformers/models/glm46v/modular_glm46v.py +4 -3
  544. transformers/models/glm46v/processing_glm46v.py +7 -41
  545. transformers/models/glm46v/video_processing_glm46v.py +9 -11
  546. transformers/models/glm4_moe/configuration_glm4_moe.py +25 -28
  547. transformers/models/glm4_moe/modeling_glm4_moe.py +41 -40
  548. transformers/models/glm4_moe/modular_glm4_moe.py +27 -30
  549. transformers/models/glm4_moe_lite/__init__.py +28 -0
  550. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +235 -0
  551. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +740 -0
  552. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +304 -0
  553. transformers/models/glm4v/configuration_glm4v.py +14 -17
  554. transformers/models/glm4v/image_processing_glm4v.py +34 -40
  555. transformers/models/glm4v/image_processing_glm4v_fast.py +6 -7
  556. transformers/models/glm4v/modeling_glm4v.py +148 -156
  557. transformers/models/glm4v/modular_glm4v.py +142 -185
  558. transformers/models/glm4v/processing_glm4v.py +7 -41
  559. transformers/models/glm4v/video_processing_glm4v.py +9 -11
  560. transformers/models/glm4v_moe/configuration_glm4v_moe.py +119 -122
  561. transformers/models/glm4v_moe/modeling_glm4v_moe.py +275 -319
  562. transformers/models/glm4v_moe/modular_glm4v_moe.py +66 -163
  563. transformers/models/glm_image/__init__.py +31 -0
  564. transformers/models/glm_image/configuration_glm_image.py +352 -0
  565. transformers/models/glm_image/image_processing_glm_image.py +503 -0
  566. transformers/models/glm_image/image_processing_glm_image_fast.py +296 -0
  567. transformers/models/glm_image/modeling_glm_image.py +1590 -0
  568. transformers/models/glm_image/modular_glm_image.py +1480 -0
  569. transformers/models/glm_image/processing_glm_image.py +217 -0
  570. transformers/models/glmasr/__init__.py +29 -0
  571. transformers/models/glmasr/configuration_glmasr.py +196 -0
  572. transformers/models/glmasr/modeling_glmasr.py +511 -0
  573. transformers/models/glmasr/modular_glmasr.py +431 -0
  574. transformers/models/glmasr/processing_glmasr.py +331 -0
  575. transformers/models/glpn/configuration_glpn.py +0 -1
  576. transformers/models/glpn/image_processing_glpn.py +11 -12
  577. transformers/models/glpn/image_processing_glpn_fast.py +8 -10
  578. transformers/models/glpn/modeling_glpn.py +10 -12
  579. transformers/models/got_ocr2/configuration_got_ocr2.py +5 -8
  580. transformers/models/got_ocr2/image_processing_got_ocr2.py +22 -24
  581. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +6 -8
  582. transformers/models/got_ocr2/modeling_got_ocr2.py +48 -45
  583. transformers/models/got_ocr2/modular_got_ocr2.py +31 -34
  584. transformers/models/got_ocr2/processing_got_ocr2.py +42 -63
  585. transformers/models/gpt2/configuration_gpt2.py +0 -1
  586. transformers/models/gpt2/modeling_gpt2.py +114 -113
  587. transformers/models/gpt2/tokenization_gpt2.py +6 -9
  588. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +0 -1
  589. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +76 -88
  590. transformers/models/gpt_neo/configuration_gpt_neo.py +0 -1
  591. transformers/models/gpt_neo/modeling_gpt_neo.py +77 -66
  592. transformers/models/gpt_neox/configuration_gpt_neox.py +19 -22
  593. transformers/models/gpt_neox/modeling_gpt_neox.py +71 -73
  594. transformers/models/gpt_neox/modular_gpt_neox.py +64 -66
  595. transformers/models/gpt_neox/tokenization_gpt_neox.py +2 -5
  596. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +15 -18
  597. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +42 -45
  598. transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +1 -3
  599. transformers/models/gpt_oss/configuration_gpt_oss.py +38 -24
  600. transformers/models/gpt_oss/modeling_gpt_oss.py +40 -44
  601. transformers/models/gpt_oss/modular_gpt_oss.py +22 -26
  602. transformers/models/gpt_sw3/tokenization_gpt_sw3.py +4 -4
  603. transformers/models/gptj/configuration_gptj.py +0 -1
  604. transformers/models/gptj/modeling_gptj.py +96 -86
  605. transformers/models/granite/configuration_granite.py +23 -26
  606. transformers/models/granite/modeling_granite.py +40 -42
  607. transformers/models/granite/modular_granite.py +29 -31
  608. transformers/models/granite_speech/configuration_granite_speech.py +0 -1
  609. transformers/models/granite_speech/feature_extraction_granite_speech.py +1 -3
  610. transformers/models/granite_speech/modeling_granite_speech.py +36 -24
  611. transformers/models/granite_speech/processing_granite_speech.py +11 -4
  612. transformers/models/granitemoe/configuration_granitemoe.py +26 -29
  613. transformers/models/granitemoe/modeling_granitemoe.py +37 -40
  614. transformers/models/granitemoe/modular_granitemoe.py +22 -25
  615. transformers/models/granitemoehybrid/__init__.py +0 -1
  616. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +41 -40
  617. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +92 -86
  618. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +29 -21
  619. transformers/models/granitemoeshared/configuration_granitemoeshared.py +27 -30
  620. transformers/models/granitemoeshared/modeling_granitemoeshared.py +50 -55
  621. transformers/models/granitemoeshared/modular_granitemoeshared.py +19 -21
  622. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -4
  623. transformers/models/grounding_dino/image_processing_grounding_dino.py +60 -62
  624. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +17 -18
  625. transformers/models/grounding_dino/modeling_grounding_dino.py +95 -97
  626. transformers/models/grounding_dino/modular_grounding_dino.py +2 -3
  627. transformers/models/grounding_dino/processing_grounding_dino.py +10 -38
  628. transformers/models/groupvit/configuration_groupvit.py +0 -1
  629. transformers/models/groupvit/modeling_groupvit.py +75 -71
  630. transformers/models/helium/configuration_helium.py +20 -22
  631. transformers/models/helium/modeling_helium.py +34 -37
  632. transformers/models/helium/modular_helium.py +3 -7
  633. transformers/models/herbert/tokenization_herbert.py +4 -6
  634. transformers/models/hgnet_v2/configuration_hgnet_v2.py +0 -1
  635. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -9
  636. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -9
  637. transformers/models/hiera/configuration_hiera.py +0 -1
  638. transformers/models/hiera/modeling_hiera.py +60 -62
  639. transformers/models/hubert/configuration_hubert.py +0 -1
  640. transformers/models/hubert/modeling_hubert.py +39 -37
  641. transformers/models/hubert/modular_hubert.py +12 -11
  642. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +21 -24
  643. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +31 -34
  644. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +4 -6
  645. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  646. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +25 -28
  647. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +44 -39
  648. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +9 -9
  649. transformers/models/ibert/configuration_ibert.py +0 -1
  650. transformers/models/ibert/modeling_ibert.py +76 -62
  651. transformers/models/ibert/quant_modules.py +0 -1
  652. transformers/models/idefics/configuration_idefics.py +0 -1
  653. transformers/models/idefics/image_processing_idefics.py +13 -15
  654. transformers/models/idefics/modeling_idefics.py +70 -61
  655. transformers/models/idefics/perceiver.py +1 -3
  656. transformers/models/idefics/processing_idefics.py +32 -48
  657. transformers/models/idefics/vision.py +22 -24
  658. transformers/models/idefics2/configuration_idefics2.py +0 -1
  659. transformers/models/idefics2/image_processing_idefics2.py +31 -32
  660. transformers/models/idefics2/image_processing_idefics2_fast.py +7 -8
  661. transformers/models/idefics2/modeling_idefics2.py +63 -59
  662. transformers/models/idefics2/processing_idefics2.py +10 -68
  663. transformers/models/idefics3/configuration_idefics3.py +0 -1
  664. transformers/models/idefics3/image_processing_idefics3.py +42 -43
  665. transformers/models/idefics3/image_processing_idefics3_fast.py +11 -12
  666. transformers/models/idefics3/modeling_idefics3.py +57 -55
  667. transformers/models/idefics3/processing_idefics3.py +15 -69
  668. transformers/models/ijepa/configuration_ijepa.py +0 -1
  669. transformers/models/ijepa/modeling_ijepa.py +10 -11
  670. transformers/models/ijepa/modular_ijepa.py +5 -7
  671. transformers/models/imagegpt/configuration_imagegpt.py +0 -1
  672. transformers/models/imagegpt/image_processing_imagegpt.py +17 -18
  673. transformers/models/imagegpt/image_processing_imagegpt_fast.py +9 -14
  674. transformers/models/imagegpt/modeling_imagegpt.py +66 -60
  675. transformers/models/informer/configuration_informer.py +6 -9
  676. transformers/models/informer/modeling_informer.py +84 -86
  677. transformers/models/informer/modular_informer.py +13 -16
  678. transformers/models/instructblip/configuration_instructblip.py +0 -1
  679. transformers/models/instructblip/modeling_instructblip.py +45 -44
  680. transformers/models/instructblip/processing_instructblip.py +10 -36
  681. transformers/models/instructblipvideo/configuration_instructblipvideo.py +0 -1
  682. transformers/models/instructblipvideo/modeling_instructblipvideo.py +107 -105
  683. transformers/models/instructblipvideo/modular_instructblipvideo.py +34 -36
  684. transformers/models/instructblipvideo/processing_instructblipvideo.py +14 -33
  685. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +4 -6
  686. transformers/models/internvl/configuration_internvl.py +0 -1
  687. transformers/models/internvl/modeling_internvl.py +52 -51
  688. transformers/models/internvl/modular_internvl.py +24 -30
  689. transformers/models/internvl/processing_internvl.py +12 -45
  690. transformers/models/internvl/video_processing_internvl.py +8 -10
  691. transformers/models/jais2/__init__.py +27 -0
  692. transformers/models/jais2/configuration_jais2.py +150 -0
  693. transformers/models/jais2/modeling_jais2.py +484 -0
  694. transformers/models/jais2/modular_jais2.py +194 -0
  695. transformers/models/jamba/configuration_jamba.py +0 -1
  696. transformers/models/jamba/modeling_jamba.py +67 -65
  697. transformers/models/jamba/modular_jamba.py +54 -55
  698. transformers/models/janus/configuration_janus.py +0 -1
  699. transformers/models/janus/image_processing_janus.py +35 -37
  700. transformers/models/janus/image_processing_janus_fast.py +12 -14
  701. transformers/models/janus/modeling_janus.py +56 -50
  702. transformers/models/janus/modular_janus.py +76 -70
  703. transformers/models/janus/processing_janus.py +17 -43
  704. transformers/models/jetmoe/configuration_jetmoe.py +20 -23
  705. transformers/models/jetmoe/modeling_jetmoe.py +41 -44
  706. transformers/models/jetmoe/modular_jetmoe.py +31 -33
  707. transformers/models/kosmos2/configuration_kosmos2.py +0 -1
  708. transformers/models/kosmos2/modeling_kosmos2.py +159 -148
  709. transformers/models/kosmos2/processing_kosmos2.py +40 -55
  710. transformers/models/kosmos2_5/__init__.py +0 -1
  711. transformers/models/kosmos2_5/configuration_kosmos2_5.py +0 -1
  712. transformers/models/kosmos2_5/image_processing_kosmos2_5.py +10 -12
  713. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +4 -13
  714. transformers/models/kosmos2_5/modeling_kosmos2_5.py +118 -110
  715. transformers/models/kosmos2_5/processing_kosmos2_5.py +8 -29
  716. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +23 -25
  717. transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +12 -14
  718. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +67 -68
  719. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +28 -22
  720. transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py +2 -8
  721. transformers/models/lasr/configuration_lasr.py +5 -3
  722. transformers/models/lasr/feature_extraction_lasr.py +10 -12
  723. transformers/models/lasr/modeling_lasr.py +21 -23
  724. transformers/models/lasr/modular_lasr.py +16 -11
  725. transformers/models/lasr/processing_lasr.py +12 -8
  726. transformers/models/lasr/tokenization_lasr.py +2 -4
  727. transformers/models/layoutlm/configuration_layoutlm.py +0 -1
  728. transformers/models/layoutlm/modeling_layoutlm.py +72 -72
  729. transformers/models/layoutlmv2/configuration_layoutlmv2.py +0 -1
  730. transformers/models/layoutlmv2/image_processing_layoutlmv2.py +18 -21
  731. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +5 -7
  732. transformers/models/layoutlmv2/modeling_layoutlmv2.py +60 -50
  733. transformers/models/layoutlmv2/processing_layoutlmv2.py +14 -44
  734. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +64 -74
  735. transformers/models/layoutlmv3/configuration_layoutlmv3.py +0 -1
  736. transformers/models/layoutlmv3/image_processing_layoutlmv3.py +24 -26
  737. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +7 -9
  738. transformers/models/layoutlmv3/modeling_layoutlmv3.py +78 -56
  739. transformers/models/layoutlmv3/processing_layoutlmv3.py +14 -46
  740. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +64 -75
  741. transformers/models/layoutxlm/configuration_layoutxlm.py +0 -1
  742. transformers/models/layoutxlm/modular_layoutxlm.py +0 -1
  743. transformers/models/layoutxlm/processing_layoutxlm.py +14 -44
  744. transformers/models/layoutxlm/tokenization_layoutxlm.py +65 -76
  745. transformers/models/led/configuration_led.py +1 -4
  746. transformers/models/led/modeling_led.py +119 -267
  747. transformers/models/levit/configuration_levit.py +0 -1
  748. transformers/models/levit/image_processing_levit.py +19 -21
  749. transformers/models/levit/image_processing_levit_fast.py +0 -1
  750. transformers/models/levit/modeling_levit.py +35 -19
  751. transformers/models/lfm2/configuration_lfm2.py +22 -23
  752. transformers/models/lfm2/modeling_lfm2.py +43 -45
  753. transformers/models/lfm2/modular_lfm2.py +29 -29
  754. transformers/models/lfm2_moe/__init__.py +0 -1
  755. transformers/models/lfm2_moe/configuration_lfm2_moe.py +1 -2
  756. transformers/models/lfm2_moe/modeling_lfm2_moe.py +58 -49
  757. transformers/models/lfm2_moe/modular_lfm2_moe.py +13 -37
  758. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -1
  759. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +34 -5
  760. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -38
  761. transformers/models/lfm2_vl/modular_lfm2_vl.py +28 -29
  762. transformers/models/lfm2_vl/processing_lfm2_vl.py +96 -76
  763. transformers/models/lightglue/image_processing_lightglue.py +16 -15
  764. transformers/models/lightglue/image_processing_lightglue_fast.py +5 -6
  765. transformers/models/lightglue/modeling_lightglue.py +28 -30
  766. transformers/models/lightglue/modular_lightglue.py +28 -28
  767. transformers/models/lighton_ocr/__init__.py +28 -0
  768. transformers/models/lighton_ocr/configuration_lighton_ocr.py +128 -0
  769. transformers/models/lighton_ocr/modeling_lighton_ocr.py +460 -0
  770. transformers/models/lighton_ocr/modular_lighton_ocr.py +403 -0
  771. transformers/models/lighton_ocr/processing_lighton_ocr.py +229 -0
  772. transformers/models/lilt/configuration_lilt.py +0 -1
  773. transformers/models/lilt/modeling_lilt.py +72 -70
  774. transformers/models/llama/configuration_llama.py +21 -24
  775. transformers/models/llama/modeling_llama.py +32 -35
  776. transformers/models/llama/tokenization_llama.py +2 -4
  777. transformers/models/llama4/configuration_llama4.py +20 -22
  778. transformers/models/llama4/image_processing_llama4_fast.py +9 -11
  779. transformers/models/llama4/modeling_llama4.py +78 -75
  780. transformers/models/llama4/processing_llama4.py +33 -57
  781. transformers/models/llava/configuration_llava.py +0 -1
  782. transformers/models/llava/image_processing_llava.py +25 -28
  783. transformers/models/llava/image_processing_llava_fast.py +6 -8
  784. transformers/models/llava/modeling_llava.py +47 -44
  785. transformers/models/llava/processing_llava.py +18 -51
  786. transformers/models/llava_next/configuration_llava_next.py +0 -1
  787. transformers/models/llava_next/image_processing_llava_next.py +43 -45
  788. transformers/models/llava_next/image_processing_llava_next_fast.py +5 -7
  789. transformers/models/llava_next/modeling_llava_next.py +49 -47
  790. transformers/models/llava_next/processing_llava_next.py +18 -47
  791. transformers/models/llava_next_video/configuration_llava_next_video.py +0 -1
  792. transformers/models/llava_next_video/modeling_llava_next_video.py +60 -58
  793. transformers/models/llava_next_video/modular_llava_next_video.py +51 -49
  794. transformers/models/llava_next_video/processing_llava_next_video.py +21 -63
  795. transformers/models/llava_next_video/video_processing_llava_next_video.py +0 -1
  796. transformers/models/llava_onevision/configuration_llava_onevision.py +0 -1
  797. transformers/models/llava_onevision/image_processing_llava_onevision.py +40 -42
  798. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +6 -8
  799. transformers/models/llava_onevision/modeling_llava_onevision.py +67 -65
  800. transformers/models/llava_onevision/modular_llava_onevision.py +58 -56
  801. transformers/models/llava_onevision/processing_llava_onevision.py +21 -53
  802. transformers/models/llava_onevision/video_processing_llava_onevision.py +0 -1
  803. transformers/models/longcat_flash/__init__.py +0 -1
  804. transformers/models/longcat_flash/configuration_longcat_flash.py +32 -35
  805. transformers/models/longcat_flash/modeling_longcat_flash.py +32 -32
  806. transformers/models/longcat_flash/modular_longcat_flash.py +18 -19
  807. transformers/models/longformer/configuration_longformer.py +1 -4
  808. transformers/models/longformer/modeling_longformer.py +99 -101
  809. transformers/models/longt5/configuration_longt5.py +0 -1
  810. transformers/models/longt5/modeling_longt5.py +43 -48
  811. transformers/models/luke/configuration_luke.py +0 -1
  812. transformers/models/luke/modeling_luke.py +179 -181
  813. transformers/models/luke/tokenization_luke.py +99 -105
  814. transformers/models/lw_detr/__init__.py +27 -0
  815. transformers/models/lw_detr/configuration_lw_detr.py +374 -0
  816. transformers/models/lw_detr/modeling_lw_detr.py +1698 -0
  817. transformers/models/lw_detr/modular_lw_detr.py +1611 -0
  818. transformers/models/lxmert/configuration_lxmert.py +0 -1
  819. transformers/models/lxmert/modeling_lxmert.py +63 -74
  820. transformers/models/m2m_100/configuration_m2m_100.py +0 -1
  821. transformers/models/m2m_100/modeling_m2m_100.py +79 -71
  822. transformers/models/m2m_100/tokenization_m2m_100.py +8 -8
  823. transformers/models/mamba/configuration_mamba.py +0 -1
  824. transformers/models/mamba/modeling_mamba.py +44 -44
  825. transformers/models/mamba2/configuration_mamba2.py +0 -1
  826. transformers/models/mamba2/modeling_mamba2.py +67 -68
  827. transformers/models/marian/configuration_marian.py +1 -2
  828. transformers/models/marian/modeling_marian.py +87 -86
  829. transformers/models/marian/tokenization_marian.py +6 -6
  830. transformers/models/markuplm/configuration_markuplm.py +0 -1
  831. transformers/models/markuplm/feature_extraction_markuplm.py +1 -2
  832. transformers/models/markuplm/modeling_markuplm.py +65 -70
  833. transformers/models/markuplm/processing_markuplm.py +31 -38
  834. transformers/models/markuplm/tokenization_markuplm.py +67 -77
  835. transformers/models/mask2former/configuration_mask2former.py +5 -8
  836. transformers/models/mask2former/image_processing_mask2former.py +84 -85
  837. transformers/models/mask2former/image_processing_mask2former_fast.py +30 -33
  838. transformers/models/mask2former/modeling_mask2former.py +99 -92
  839. transformers/models/mask2former/modular_mask2former.py +6 -8
  840. transformers/models/maskformer/configuration_maskformer.py +6 -9
  841. transformers/models/maskformer/configuration_maskformer_swin.py +0 -1
  842. transformers/models/maskformer/image_processing_maskformer.py +84 -85
  843. transformers/models/maskformer/image_processing_maskformer_fast.py +29 -33
  844. transformers/models/maskformer/modeling_maskformer.py +65 -59
  845. transformers/models/maskformer/modeling_maskformer_swin.py +34 -32
  846. transformers/models/mbart/configuration_mbart.py +1 -1
  847. transformers/models/mbart/modeling_mbart.py +118 -113
  848. transformers/models/mbart/tokenization_mbart.py +2 -4
  849. transformers/models/mbart50/tokenization_mbart50.py +3 -5
  850. transformers/models/megatron_bert/configuration_megatron_bert.py +0 -1
  851. transformers/models/megatron_bert/modeling_megatron_bert.py +141 -150
  852. transformers/models/metaclip_2/modeling_metaclip_2.py +48 -46
  853. transformers/models/metaclip_2/modular_metaclip_2.py +21 -21
  854. transformers/models/mgp_str/configuration_mgp_str.py +0 -1
  855. transformers/models/mgp_str/modeling_mgp_str.py +14 -16
  856. transformers/models/mgp_str/processing_mgp_str.py +3 -20
  857. transformers/models/mgp_str/tokenization_mgp_str.py +1 -3
  858. transformers/models/mimi/configuration_mimi.py +38 -40
  859. transformers/models/mimi/modeling_mimi.py +100 -82
  860. transformers/models/minimax/__init__.py +0 -1
  861. transformers/models/minimax/configuration_minimax.py +32 -36
  862. transformers/models/minimax/modeling_minimax.py +57 -47
  863. transformers/models/minimax/modular_minimax.py +62 -54
  864. transformers/models/minimax_m2/__init__.py +28 -0
  865. transformers/models/minimax_m2/configuration_minimax_m2.py +211 -0
  866. transformers/models/minimax_m2/modeling_minimax_m2.py +704 -0
  867. transformers/models/minimax_m2/modular_minimax_m2.py +369 -0
  868. transformers/models/ministral/configuration_ministral.py +20 -22
  869. transformers/models/ministral/modeling_ministral.py +32 -34
  870. transformers/models/ministral/modular_ministral.py +27 -29
  871. transformers/models/ministral3/configuration_ministral3.py +19 -22
  872. transformers/models/ministral3/modeling_ministral3.py +32 -34
  873. transformers/models/ministral3/modular_ministral3.py +4 -5
  874. transformers/models/mistral/configuration_mistral.py +19 -22
  875. transformers/models/mistral/modeling_mistral.py +32 -34
  876. transformers/models/mistral/modular_mistral.py +11 -12
  877. transformers/models/mistral3/configuration_mistral3.py +0 -1
  878. transformers/models/mistral3/modeling_mistral3.py +53 -46
  879. transformers/models/mistral3/modular_mistral3.py +38 -36
  880. transformers/models/mixtral/configuration_mixtral.py +24 -27
  881. transformers/models/mixtral/modeling_mixtral.py +47 -42
  882. transformers/models/mixtral/modular_mixtral.py +32 -31
  883. transformers/models/mlcd/configuration_mlcd.py +0 -1
  884. transformers/models/mlcd/modeling_mlcd.py +16 -12
  885. transformers/models/mlcd/modular_mlcd.py +13 -11
  886. transformers/models/mllama/configuration_mllama.py +5 -8
  887. transformers/models/mllama/image_processing_mllama.py +23 -25
  888. transformers/models/mllama/image_processing_mllama_fast.py +5 -6
  889. transformers/models/mllama/modeling_mllama.py +94 -86
  890. transformers/models/mllama/processing_mllama.py +6 -55
  891. transformers/models/mluke/tokenization_mluke.py +97 -103
  892. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -3
  893. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +95 -97
  894. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -3
  895. transformers/models/mobilebert/configuration_mobilebert.py +0 -1
  896. transformers/models/mobilebert/modeling_mobilebert.py +77 -85
  897. transformers/models/mobilebert/tokenization_mobilebert.py +0 -1
  898. transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +0 -1
  899. transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +20 -23
  900. transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +0 -1
  901. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +13 -16
  902. transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +0 -1
  903. transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +48 -51
  904. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +10 -12
  905. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +17 -20
  906. transformers/models/mobilevit/configuration_mobilevit.py +0 -1
  907. transformers/models/mobilevit/image_processing_mobilevit.py +46 -49
  908. transformers/models/mobilevit/image_processing_mobilevit_fast.py +9 -11
  909. transformers/models/mobilevit/modeling_mobilevit.py +21 -19
  910. transformers/models/mobilevitv2/configuration_mobilevitv2.py +0 -1
  911. transformers/models/mobilevitv2/modeling_mobilevitv2.py +21 -20
  912. transformers/models/modernbert/configuration_modernbert.py +34 -34
  913. transformers/models/modernbert/modeling_modernbert.py +135 -126
  914. transformers/models/modernbert/modular_modernbert.py +167 -156
  915. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +30 -32
  916. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +54 -48
  917. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +78 -71
  918. transformers/models/moonshine/configuration_moonshine.py +22 -24
  919. transformers/models/moonshine/modeling_moonshine.py +64 -66
  920. transformers/models/moonshine/modular_moonshine.py +72 -73
  921. transformers/models/moshi/configuration_moshi.py +18 -21
  922. transformers/models/moshi/modeling_moshi.py +150 -183
  923. transformers/models/mpnet/configuration_mpnet.py +0 -1
  924. transformers/models/mpnet/modeling_mpnet.py +57 -57
  925. transformers/models/mpnet/tokenization_mpnet.py +1 -4
  926. transformers/models/mpt/configuration_mpt.py +1 -9
  927. transformers/models/mpt/modeling_mpt.py +58 -60
  928. transformers/models/mra/configuration_mra.py +0 -1
  929. transformers/models/mra/modeling_mra.py +58 -57
  930. transformers/models/mt5/configuration_mt5.py +2 -4
  931. transformers/models/mt5/modeling_mt5.py +75 -87
  932. transformers/models/musicgen/configuration_musicgen.py +0 -1
  933. transformers/models/musicgen/modeling_musicgen.py +113 -120
  934. transformers/models/musicgen/processing_musicgen.py +3 -21
  935. transformers/models/musicgen_melody/configuration_musicgen_melody.py +0 -1
  936. transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +8 -9
  937. transformers/models/musicgen_melody/modeling_musicgen_melody.py +110 -109
  938. transformers/models/musicgen_melody/processing_musicgen_melody.py +3 -22
  939. transformers/models/mvp/configuration_mvp.py +0 -1
  940. transformers/models/mvp/modeling_mvp.py +122 -119
  941. transformers/models/myt5/tokenization_myt5.py +8 -10
  942. transformers/models/nanochat/configuration_nanochat.py +0 -1
  943. transformers/models/nanochat/modeling_nanochat.py +33 -36
  944. transformers/models/nanochat/modular_nanochat.py +12 -14
  945. transformers/models/nemotron/configuration_nemotron.py +20 -23
  946. transformers/models/nemotron/modeling_nemotron.py +51 -54
  947. transformers/models/nllb/tokenization_nllb.py +7 -9
  948. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -1
  949. transformers/models/nllb_moe/modeling_nllb_moe.py +77 -69
  950. transformers/models/nougat/image_processing_nougat.py +29 -32
  951. transformers/models/nougat/image_processing_nougat_fast.py +4 -6
  952. transformers/models/nougat/processing_nougat.py +37 -39
  953. transformers/models/nougat/tokenization_nougat.py +16 -23
  954. transformers/models/nystromformer/configuration_nystromformer.py +0 -1
  955. transformers/models/nystromformer/modeling_nystromformer.py +68 -63
  956. transformers/models/olmo/configuration_olmo.py +18 -21
  957. transformers/models/olmo/modeling_olmo.py +32 -35
  958. transformers/models/olmo/modular_olmo.py +5 -9
  959. transformers/models/olmo2/configuration_olmo2.py +18 -21
  960. transformers/models/olmo2/modeling_olmo2.py +33 -36
  961. transformers/models/olmo2/modular_olmo2.py +29 -31
  962. transformers/models/olmo3/__init__.py +0 -1
  963. transformers/models/olmo3/configuration_olmo3.py +20 -23
  964. transformers/models/olmo3/modeling_olmo3.py +32 -35
  965. transformers/models/olmo3/modular_olmo3.py +31 -33
  966. transformers/models/olmoe/configuration_olmoe.py +24 -26
  967. transformers/models/olmoe/modeling_olmoe.py +49 -43
  968. transformers/models/olmoe/modular_olmoe.py +16 -15
  969. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -3
  970. transformers/models/omdet_turbo/modeling_omdet_turbo.py +42 -40
  971. transformers/models/omdet_turbo/processing_omdet_turbo.py +19 -67
  972. transformers/models/oneformer/configuration_oneformer.py +5 -8
  973. transformers/models/oneformer/image_processing_oneformer.py +83 -84
  974. transformers/models/oneformer/image_processing_oneformer_fast.py +33 -34
  975. transformers/models/oneformer/modeling_oneformer.py +130 -162
  976. transformers/models/oneformer/processing_oneformer.py +28 -43
  977. transformers/models/openai/configuration_openai.py +0 -1
  978. transformers/models/openai/modeling_openai.py +62 -51
  979. transformers/models/openai/tokenization_openai.py +2 -5
  980. transformers/models/opt/configuration_opt.py +0 -1
  981. transformers/models/opt/modeling_opt.py +74 -75
  982. transformers/models/ovis2/__init__.py +0 -1
  983. transformers/models/ovis2/configuration_ovis2.py +0 -1
  984. transformers/models/ovis2/image_processing_ovis2.py +22 -24
  985. transformers/models/ovis2/image_processing_ovis2_fast.py +6 -8
  986. transformers/models/ovis2/modeling_ovis2.py +58 -48
  987. transformers/models/ovis2/modular_ovis2.py +38 -32
  988. transformers/models/ovis2/processing_ovis2.py +12 -40
  989. transformers/models/owlv2/configuration_owlv2.py +0 -1
  990. transformers/models/owlv2/image_processing_owlv2.py +20 -21
  991. transformers/models/owlv2/image_processing_owlv2_fast.py +7 -10
  992. transformers/models/owlv2/modeling_owlv2.py +89 -90
  993. transformers/models/owlv2/modular_owlv2.py +6 -9
  994. transformers/models/owlv2/processing_owlv2.py +20 -49
  995. transformers/models/owlvit/configuration_owlvit.py +0 -1
  996. transformers/models/owlvit/image_processing_owlvit.py +21 -22
  997. transformers/models/owlvit/image_processing_owlvit_fast.py +2 -3
  998. transformers/models/owlvit/modeling_owlvit.py +88 -89
  999. transformers/models/owlvit/processing_owlvit.py +20 -48
  1000. transformers/models/paddleocr_vl/__init__.py +0 -1
  1001. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +19 -19
  1002. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +37 -37
  1003. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +12 -12
  1004. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +104 -90
  1005. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +90 -80
  1006. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +1 -3
  1007. transformers/models/paligemma/configuration_paligemma.py +0 -1
  1008. transformers/models/paligemma/modeling_paligemma.py +73 -67
  1009. transformers/models/paligemma/processing_paligemma.py +13 -66
  1010. transformers/models/parakeet/configuration_parakeet.py +1 -4
  1011. transformers/models/parakeet/feature_extraction_parakeet.py +10 -12
  1012. transformers/models/parakeet/modeling_parakeet.py +23 -22
  1013. transformers/models/parakeet/modular_parakeet.py +21 -18
  1014. transformers/models/parakeet/processing_parakeet.py +12 -5
  1015. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +5 -7
  1016. transformers/models/patchtsmixer/configuration_patchtsmixer.py +5 -8
  1017. transformers/models/patchtsmixer/modeling_patchtsmixer.py +64 -62
  1018. transformers/models/patchtst/configuration_patchtst.py +6 -9
  1019. transformers/models/patchtst/modeling_patchtst.py +77 -78
  1020. transformers/models/pe_audio/__init__.py +29 -0
  1021. transformers/models/pe_audio/configuration_pe_audio.py +204 -0
  1022. transformers/models/pe_audio/feature_extraction_pe_audio.py +160 -0
  1023. transformers/models/pe_audio/modeling_pe_audio.py +819 -0
  1024. transformers/models/pe_audio/modular_pe_audio.py +298 -0
  1025. transformers/models/pe_audio/processing_pe_audio.py +23 -0
  1026. transformers/models/pe_audio_video/__init__.py +28 -0
  1027. transformers/models/pe_audio_video/configuration_pe_audio_video.py +223 -0
  1028. transformers/models/pe_audio_video/modeling_pe_audio_video.py +971 -0
  1029. transformers/models/pe_audio_video/modular_pe_audio_video.py +763 -0
  1030. transformers/models/pe_audio_video/processing_pe_audio_video.py +24 -0
  1031. transformers/models/pe_video/__init__.py +29 -0
  1032. transformers/models/pe_video/configuration_pe_video.py +209 -0
  1033. transformers/models/pe_video/modeling_pe_video.py +635 -0
  1034. transformers/models/pe_video/modular_pe_video.py +218 -0
  1035. transformers/models/pe_video/processing_pe_video.py +10 -0
  1036. transformers/models/pe_video/video_processing_pe_video.py +64 -0
  1037. transformers/models/pegasus/configuration_pegasus.py +1 -1
  1038. transformers/models/pegasus/modeling_pegasus.py +66 -65
  1039. transformers/models/pegasus/tokenization_pegasus.py +1 -4
  1040. transformers/models/pegasus_x/configuration_pegasus_x.py +0 -1
  1041. transformers/models/pegasus_x/modeling_pegasus_x.py +51 -52
  1042. transformers/models/perceiver/configuration_perceiver.py +0 -1
  1043. transformers/models/perceiver/image_processing_perceiver.py +22 -25
  1044. transformers/models/perceiver/image_processing_perceiver_fast.py +5 -7
  1045. transformers/models/perceiver/modeling_perceiver.py +140 -137
  1046. transformers/models/perceiver/tokenization_perceiver.py +3 -6
  1047. transformers/models/perception_lm/configuration_perception_lm.py +0 -1
  1048. transformers/models/perception_lm/image_processing_perception_lm_fast.py +8 -10
  1049. transformers/models/perception_lm/modeling_perception_lm.py +45 -43
  1050. transformers/models/perception_lm/modular_perception_lm.py +38 -36
  1051. transformers/models/perception_lm/processing_perception_lm.py +13 -47
  1052. transformers/models/perception_lm/video_processing_perception_lm.py +0 -1
  1053. transformers/models/persimmon/configuration_persimmon.py +18 -21
  1054. transformers/models/persimmon/modeling_persimmon.py +40 -43
  1055. transformers/models/phi/configuration_phi.py +19 -22
  1056. transformers/models/phi/modeling_phi.py +36 -38
  1057. transformers/models/phi/modular_phi.py +23 -23
  1058. transformers/models/phi3/configuration_phi3.py +23 -26
  1059. transformers/models/phi3/modeling_phi3.py +34 -37
  1060. transformers/models/phi3/modular_phi3.py +13 -17
  1061. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +25 -26
  1062. transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +7 -9
  1063. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +7 -7
  1064. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +58 -57
  1065. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +62 -60
  1066. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +7 -44
  1067. transformers/models/phimoe/configuration_phimoe.py +26 -29
  1068. transformers/models/phimoe/modeling_phimoe.py +47 -42
  1069. transformers/models/phimoe/modular_phimoe.py +1 -2
  1070. transformers/models/phobert/tokenization_phobert.py +4 -6
  1071. transformers/models/pix2struct/configuration_pix2struct.py +0 -1
  1072. transformers/models/pix2struct/image_processing_pix2struct.py +15 -19
  1073. transformers/models/pix2struct/image_processing_pix2struct_fast.py +7 -10
  1074. transformers/models/pix2struct/modeling_pix2struct.py +42 -45
  1075. transformers/models/pix2struct/processing_pix2struct.py +5 -30
  1076. transformers/models/pixio/__init__.py +29 -0
  1077. transformers/models/pixio/configuration_pixio.py +150 -0
  1078. transformers/models/pixio/modeling_pixio.py +505 -0
  1079. transformers/models/pixio/modular_pixio.py +401 -0
  1080. transformers/models/pixtral/configuration_pixtral.py +11 -14
  1081. transformers/models/pixtral/image_processing_pixtral.py +26 -28
  1082. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -6
  1083. transformers/models/pixtral/modeling_pixtral.py +23 -26
  1084. transformers/models/pixtral/processing_pixtral.py +21 -53
  1085. transformers/models/plbart/configuration_plbart.py +1 -1
  1086. transformers/models/plbart/modeling_plbart.py +107 -102
  1087. transformers/models/plbart/modular_plbart.py +36 -32
  1088. transformers/models/plbart/tokenization_plbart.py +4 -5
  1089. transformers/models/poolformer/configuration_poolformer.py +0 -1
  1090. transformers/models/poolformer/image_processing_poolformer.py +21 -24
  1091. transformers/models/poolformer/image_processing_poolformer_fast.py +6 -8
  1092. transformers/models/poolformer/modeling_poolformer.py +21 -13
  1093. transformers/models/pop2piano/configuration_pop2piano.py +0 -2
  1094. transformers/models/pop2piano/feature_extraction_pop2piano.py +6 -9
  1095. transformers/models/pop2piano/modeling_pop2piano.py +22 -23
  1096. transformers/models/pop2piano/processing_pop2piano.py +25 -33
  1097. transformers/models/pop2piano/tokenization_pop2piano.py +15 -23
  1098. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +3 -3
  1099. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +28 -28
  1100. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +14 -15
  1101. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +9 -10
  1102. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +9 -10
  1103. transformers/models/prophetnet/configuration_prophetnet.py +26 -28
  1104. transformers/models/prophetnet/modeling_prophetnet.py +111 -131
  1105. transformers/models/prophetnet/tokenization_prophetnet.py +14 -16
  1106. transformers/models/pvt/configuration_pvt.py +0 -1
  1107. transformers/models/pvt/image_processing_pvt.py +17 -20
  1108. transformers/models/pvt/image_processing_pvt_fast.py +0 -1
  1109. transformers/models/pvt/modeling_pvt.py +19 -21
  1110. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  1111. transformers/models/pvt_v2/modeling_pvt_v2.py +21 -23
  1112. transformers/models/qwen2/configuration_qwen2.py +18 -21
  1113. transformers/models/qwen2/modeling_qwen2.py +32 -34
  1114. transformers/models/qwen2/modular_qwen2.py +11 -12
  1115. transformers/models/qwen2/tokenization_qwen2.py +2 -5
  1116. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +20 -23
  1117. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +239 -192
  1118. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +174 -127
  1119. transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +41 -49
  1120. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +22 -25
  1121. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +112 -101
  1122. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +72 -107
  1123. transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +7 -43
  1124. transformers/models/qwen2_audio/configuration_qwen2_audio.py +0 -1
  1125. transformers/models/qwen2_audio/modeling_qwen2_audio.py +29 -31
  1126. transformers/models/qwen2_audio/processing_qwen2_audio.py +13 -42
  1127. transformers/models/qwen2_moe/configuration_qwen2_moe.py +28 -31
  1128. transformers/models/qwen2_moe/modeling_qwen2_moe.py +48 -43
  1129. transformers/models/qwen2_moe/modular_qwen2_moe.py +7 -10
  1130. transformers/models/qwen2_vl/configuration_qwen2_vl.py +22 -24
  1131. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +41 -42
  1132. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +8 -9
  1133. transformers/models/qwen2_vl/modeling_qwen2_vl.py +108 -96
  1134. transformers/models/qwen2_vl/processing_qwen2_vl.py +7 -44
  1135. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +35 -13
  1136. transformers/models/qwen3/configuration_qwen3.py +20 -23
  1137. transformers/models/qwen3/modeling_qwen3.py +32 -35
  1138. transformers/models/qwen3/modular_qwen3.py +4 -6
  1139. transformers/models/qwen3_moe/configuration_qwen3_moe.py +25 -28
  1140. transformers/models/qwen3_moe/modeling_qwen3_moe.py +48 -43
  1141. transformers/models/qwen3_moe/modular_qwen3_moe.py +10 -13
  1142. transformers/models/qwen3_next/configuration_qwen3_next.py +31 -34
  1143. transformers/models/qwen3_next/modeling_qwen3_next.py +43 -48
  1144. transformers/models/qwen3_next/modular_qwen3_next.py +33 -34
  1145. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +89 -88
  1146. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +199 -156
  1147. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +170 -152
  1148. transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +40 -48
  1149. transformers/models/qwen3_vl/configuration_qwen3_vl.py +21 -24
  1150. transformers/models/qwen3_vl/modeling_qwen3_vl.py +91 -81
  1151. transformers/models/qwen3_vl/modular_qwen3_vl.py +86 -112
  1152. transformers/models/qwen3_vl/processing_qwen3_vl.py +6 -42
  1153. transformers/models/qwen3_vl/video_processing_qwen3_vl.py +10 -12
  1154. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +21 -25
  1155. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +174 -195
  1156. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +65 -117
  1157. transformers/models/rag/configuration_rag.py +0 -9
  1158. transformers/models/rag/modeling_rag.py +123 -127
  1159. transformers/models/rag/retrieval_rag.py +2 -4
  1160. transformers/models/rag/tokenization_rag.py +0 -50
  1161. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +21 -24
  1162. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +34 -36
  1163. transformers/models/reformer/configuration_reformer.py +0 -1
  1164. transformers/models/reformer/modeling_reformer.py +76 -69
  1165. transformers/models/reformer/tokenization_reformer.py +3 -6
  1166. transformers/models/regnet/configuration_regnet.py +0 -1
  1167. transformers/models/regnet/modeling_regnet.py +11 -9
  1168. transformers/models/rembert/configuration_rembert.py +0 -1
  1169. transformers/models/rembert/modeling_rembert.py +115 -111
  1170. transformers/models/rembert/tokenization_rembert.py +1 -4
  1171. transformers/models/resnet/configuration_resnet.py +0 -1
  1172. transformers/models/resnet/modeling_resnet.py +16 -13
  1173. transformers/models/roberta/configuration_roberta.py +0 -1
  1174. transformers/models/roberta/modeling_roberta.py +94 -93
  1175. transformers/models/roberta/modular_roberta.py +58 -58
  1176. transformers/models/roberta/tokenization_roberta.py +2 -5
  1177. transformers/models/roberta/tokenization_roberta_old.py +2 -4
  1178. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +0 -1
  1179. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +94 -93
  1180. transformers/models/roc_bert/configuration_roc_bert.py +0 -1
  1181. transformers/models/roc_bert/modeling_roc_bert.py +122 -121
  1182. transformers/models/roc_bert/tokenization_roc_bert.py +88 -94
  1183. transformers/models/roformer/configuration_roformer.py +0 -1
  1184. transformers/models/roformer/modeling_roformer.py +79 -81
  1185. transformers/models/roformer/tokenization_roformer.py +3 -6
  1186. transformers/models/roformer/tokenization_utils.py +0 -1
  1187. transformers/models/rt_detr/configuration_rt_detr.py +1 -2
  1188. transformers/models/rt_detr/configuration_rt_detr_resnet.py +0 -1
  1189. transformers/models/rt_detr/image_processing_rt_detr.py +54 -55
  1190. transformers/models/rt_detr/image_processing_rt_detr_fast.py +15 -15
  1191. transformers/models/rt_detr/modeling_rt_detr.py +84 -82
  1192. transformers/models/rt_detr/modeling_rt_detr_resnet.py +10 -7
  1193. transformers/models/rt_detr/modular_rt_detr.py +14 -14
  1194. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -4
  1195. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +86 -81
  1196. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +10 -7
  1197. transformers/models/rwkv/configuration_rwkv.py +0 -1
  1198. transformers/models/rwkv/modeling_rwkv.py +30 -32
  1199. transformers/models/sam/configuration_sam.py +1 -1
  1200. transformers/models/sam/image_processing_sam.py +59 -60
  1201. transformers/models/sam/image_processing_sam_fast.py +21 -23
  1202. transformers/models/sam/modeling_sam.py +37 -36
  1203. transformers/models/sam/processing_sam.py +39 -27
  1204. transformers/models/sam2/configuration_sam2.py +1 -2
  1205. transformers/models/sam2/image_processing_sam2_fast.py +14 -15
  1206. transformers/models/sam2/modeling_sam2.py +50 -48
  1207. transformers/models/sam2/modular_sam2.py +48 -45
  1208. transformers/models/sam2/processing_sam2.py +31 -47
  1209. transformers/models/sam2_video/configuration_sam2_video.py +0 -1
  1210. transformers/models/sam2_video/modeling_sam2_video.py +119 -112
  1211. transformers/models/sam2_video/modular_sam2_video.py +91 -97
  1212. transformers/models/sam2_video/processing_sam2_video.py +49 -66
  1213. transformers/models/sam2_video/video_processing_sam2_video.py +1 -4
  1214. transformers/models/sam3/configuration_sam3.py +21 -2
  1215. transformers/models/sam3/image_processing_sam3_fast.py +17 -20
  1216. transformers/models/sam3/modeling_sam3.py +77 -56
  1217. transformers/models/sam3/modular_sam3.py +3 -8
  1218. transformers/models/sam3/processing_sam3.py +29 -48
  1219. transformers/models/sam3_tracker/__init__.py +0 -1
  1220. transformers/models/sam3_tracker/configuration_sam3_tracker.py +0 -1
  1221. transformers/models/sam3_tracker/modeling_sam3_tracker.py +36 -36
  1222. transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -1
  1223. transformers/models/sam3_tracker/processing_sam3_tracker.py +31 -47
  1224. transformers/models/sam3_tracker_video/__init__.py +0 -1
  1225. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -1
  1226. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +96 -85
  1227. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +27 -6
  1228. transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +50 -66
  1229. transformers/models/sam3_video/configuration_sam3_video.py +14 -1
  1230. transformers/models/sam3_video/modeling_sam3_video.py +32 -34
  1231. transformers/models/sam3_video/processing_sam3_video.py +26 -46
  1232. transformers/models/sam_hq/__init__.py +1 -1
  1233. transformers/models/sam_hq/configuration_sam_hq.py +1 -1
  1234. transformers/models/sam_hq/modeling_sam_hq.py +65 -64
  1235. transformers/models/sam_hq/modular_sam_hq.py +17 -19
  1236. transformers/models/sam_hq/{processing_samhq.py → processing_sam_hq.py} +39 -28
  1237. transformers/models/seamless_m4t/configuration_seamless_m4t.py +0 -1
  1238. transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py +8 -11
  1239. transformers/models/seamless_m4t/modeling_seamless_m4t.py +207 -193
  1240. transformers/models/seamless_m4t/processing_seamless_m4t.py +18 -39
  1241. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +15 -20
  1242. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +0 -1
  1243. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +199 -195
  1244. transformers/models/seed_oss/configuration_seed_oss.py +23 -25
  1245. transformers/models/seed_oss/modeling_seed_oss.py +31 -33
  1246. transformers/models/seed_oss/modular_seed_oss.py +3 -4
  1247. transformers/models/segformer/configuration_segformer.py +0 -10
  1248. transformers/models/segformer/image_processing_segformer.py +39 -42
  1249. transformers/models/segformer/image_processing_segformer_fast.py +7 -9
  1250. transformers/models/segformer/modeling_segformer.py +26 -28
  1251. transformers/models/segformer/modular_segformer.py +5 -7
  1252. transformers/models/seggpt/configuration_seggpt.py +0 -1
  1253. transformers/models/seggpt/image_processing_seggpt.py +38 -41
  1254. transformers/models/seggpt/modeling_seggpt.py +28 -30
  1255. transformers/models/sew/configuration_sew.py +0 -1
  1256. transformers/models/sew/modeling_sew.py +33 -35
  1257. transformers/models/sew/modular_sew.py +10 -12
  1258. transformers/models/sew_d/configuration_sew_d.py +0 -1
  1259. transformers/models/sew_d/modeling_sew_d.py +28 -30
  1260. transformers/models/shieldgemma2/configuration_shieldgemma2.py +0 -1
  1261. transformers/models/shieldgemma2/modeling_shieldgemma2.py +16 -17
  1262. transformers/models/shieldgemma2/processing_shieldgemma2.py +3 -5
  1263. transformers/models/siglip/configuration_siglip.py +0 -1
  1264. transformers/models/siglip/image_processing_siglip.py +17 -20
  1265. transformers/models/siglip/image_processing_siglip_fast.py +0 -1
  1266. transformers/models/siglip/modeling_siglip.py +62 -41
  1267. transformers/models/siglip/processing_siglip.py +2 -14
  1268. transformers/models/siglip/tokenization_siglip.py +6 -7
  1269. transformers/models/siglip2/configuration_siglip2.py +1 -1
  1270. transformers/models/siglip2/image_processing_siglip2.py +15 -16
  1271. transformers/models/siglip2/image_processing_siglip2_fast.py +4 -5
  1272. transformers/models/siglip2/modeling_siglip2.py +114 -92
  1273. transformers/models/siglip2/modular_siglip2.py +23 -25
  1274. transformers/models/siglip2/processing_siglip2.py +2 -14
  1275. transformers/models/smollm3/configuration_smollm3.py +23 -26
  1276. transformers/models/smollm3/modeling_smollm3.py +32 -35
  1277. transformers/models/smollm3/modular_smollm3.py +27 -29
  1278. transformers/models/smolvlm/configuration_smolvlm.py +1 -1
  1279. transformers/models/smolvlm/image_processing_smolvlm.py +42 -43
  1280. transformers/models/smolvlm/image_processing_smolvlm_fast.py +12 -12
  1281. transformers/models/smolvlm/modeling_smolvlm.py +56 -53
  1282. transformers/models/smolvlm/modular_smolvlm.py +15 -17
  1283. transformers/models/smolvlm/processing_smolvlm.py +15 -76
  1284. transformers/models/smolvlm/video_processing_smolvlm.py +7 -9
  1285. transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +0 -1
  1286. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +20 -23
  1287. transformers/models/speech_to_text/configuration_speech_to_text.py +0 -1
  1288. transformers/models/speech_to_text/feature_extraction_speech_to_text.py +10 -13
  1289. transformers/models/speech_to_text/modeling_speech_to_text.py +62 -54
  1290. transformers/models/speech_to_text/processing_speech_to_text.py +4 -30
  1291. transformers/models/speech_to_text/tokenization_speech_to_text.py +5 -6
  1292. transformers/models/speecht5/configuration_speecht5.py +0 -1
  1293. transformers/models/speecht5/feature_extraction_speecht5.py +16 -37
  1294. transformers/models/speecht5/modeling_speecht5.py +200 -174
  1295. transformers/models/speecht5/number_normalizer.py +0 -1
  1296. transformers/models/speecht5/processing_speecht5.py +3 -37
  1297. transformers/models/speecht5/tokenization_speecht5.py +4 -5
  1298. transformers/models/splinter/configuration_splinter.py +0 -1
  1299. transformers/models/splinter/modeling_splinter.py +63 -59
  1300. transformers/models/splinter/tokenization_splinter.py +2 -4
  1301. transformers/models/squeezebert/configuration_squeezebert.py +0 -1
  1302. transformers/models/squeezebert/modeling_squeezebert.py +62 -62
  1303. transformers/models/squeezebert/tokenization_squeezebert.py +0 -1
  1304. transformers/models/stablelm/configuration_stablelm.py +20 -23
  1305. transformers/models/stablelm/modeling_stablelm.py +40 -43
  1306. transformers/models/starcoder2/configuration_starcoder2.py +19 -22
  1307. transformers/models/starcoder2/modeling_starcoder2.py +34 -37
  1308. transformers/models/starcoder2/modular_starcoder2.py +13 -15
  1309. transformers/models/superglue/configuration_superglue.py +3 -3
  1310. transformers/models/superglue/image_processing_superglue.py +15 -15
  1311. transformers/models/superglue/image_processing_superglue_fast.py +5 -7
  1312. transformers/models/superglue/modeling_superglue.py +32 -33
  1313. transformers/models/superpoint/image_processing_superpoint.py +15 -15
  1314. transformers/models/superpoint/image_processing_superpoint_fast.py +5 -7
  1315. transformers/models/superpoint/modeling_superpoint.py +13 -14
  1316. transformers/models/swiftformer/configuration_swiftformer.py +0 -1
  1317. transformers/models/swiftformer/modeling_swiftformer.py +16 -14
  1318. transformers/models/swin/configuration_swin.py +0 -1
  1319. transformers/models/swin/modeling_swin.py +74 -82
  1320. transformers/models/swin2sr/configuration_swin2sr.py +0 -1
  1321. transformers/models/swin2sr/image_processing_swin2sr.py +10 -13
  1322. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -6
  1323. transformers/models/swin2sr/modeling_swin2sr.py +75 -61
  1324. transformers/models/swinv2/configuration_swinv2.py +0 -1
  1325. transformers/models/swinv2/modeling_swinv2.py +96 -100
  1326. transformers/models/switch_transformers/configuration_switch_transformers.py +0 -1
  1327. transformers/models/switch_transformers/modeling_switch_transformers.py +34 -41
  1328. transformers/models/switch_transformers/modular_switch_transformers.py +31 -38
  1329. transformers/models/t5/configuration_t5.py +7 -2
  1330. transformers/models/t5/modeling_t5.py +76 -84
  1331. transformers/models/t5/tokenization_t5.py +1 -3
  1332. transformers/models/t5gemma/configuration_t5gemma.py +33 -34
  1333. transformers/models/t5gemma/modeling_t5gemma.py +97 -100
  1334. transformers/models/t5gemma/modular_t5gemma.py +117 -118
  1335. transformers/models/t5gemma2/configuration_t5gemma2.py +59 -96
  1336. transformers/models/t5gemma2/modeling_t5gemma2.py +109 -103
  1337. transformers/models/t5gemma2/modular_t5gemma2.py +375 -91
  1338. transformers/models/table_transformer/configuration_table_transformer.py +1 -2
  1339. transformers/models/table_transformer/modeling_table_transformer.py +47 -49
  1340. transformers/models/tapas/configuration_tapas.py +0 -1
  1341. transformers/models/tapas/modeling_tapas.py +64 -66
  1342. transformers/models/tapas/tokenization_tapas.py +115 -153
  1343. transformers/models/textnet/configuration_textnet.py +0 -1
  1344. transformers/models/textnet/image_processing_textnet.py +22 -25
  1345. transformers/models/textnet/image_processing_textnet_fast.py +5 -7
  1346. transformers/models/textnet/modeling_textnet.py +13 -14
  1347. transformers/models/time_series_transformer/configuration_time_series_transformer.py +5 -8
  1348. transformers/models/time_series_transformer/modeling_time_series_transformer.py +79 -81
  1349. transformers/models/timesfm/configuration_timesfm.py +0 -1
  1350. transformers/models/timesfm/modeling_timesfm.py +29 -19
  1351. transformers/models/timesfm/modular_timesfm.py +28 -18
  1352. transformers/models/timesformer/configuration_timesformer.py +0 -1
  1353. transformers/models/timesformer/modeling_timesformer.py +13 -16
  1354. transformers/models/timm_backbone/configuration_timm_backbone.py +0 -1
  1355. transformers/models/timm_backbone/modeling_timm_backbone.py +17 -15
  1356. transformers/models/timm_wrapper/configuration_timm_wrapper.py +5 -3
  1357. transformers/models/timm_wrapper/image_processing_timm_wrapper.py +4 -5
  1358. transformers/models/timm_wrapper/modeling_timm_wrapper.py +32 -28
  1359. transformers/models/trocr/configuration_trocr.py +0 -1
  1360. transformers/models/trocr/modeling_trocr.py +39 -42
  1361. transformers/models/trocr/processing_trocr.py +5 -25
  1362. transformers/models/tvp/configuration_tvp.py +5 -2
  1363. transformers/models/tvp/image_processing_tvp.py +50 -52
  1364. transformers/models/tvp/image_processing_tvp_fast.py +9 -10
  1365. transformers/models/tvp/modeling_tvp.py +25 -27
  1366. transformers/models/tvp/processing_tvp.py +2 -14
  1367. transformers/models/udop/configuration_udop.py +1 -1
  1368. transformers/models/udop/modeling_udop.py +63 -70
  1369. transformers/models/udop/processing_udop.py +7 -26
  1370. transformers/models/udop/tokenization_udop.py +80 -93
  1371. transformers/models/umt5/configuration_umt5.py +2 -3
  1372. transformers/models/umt5/modeling_umt5.py +80 -87
  1373. transformers/models/unispeech/configuration_unispeech.py +0 -1
  1374. transformers/models/unispeech/modeling_unispeech.py +47 -49
  1375. transformers/models/unispeech/modular_unispeech.py +20 -22
  1376. transformers/models/unispeech_sat/configuration_unispeech_sat.py +0 -1
  1377. transformers/models/unispeech_sat/modeling_unispeech_sat.py +63 -65
  1378. transformers/models/unispeech_sat/modular_unispeech_sat.py +21 -23
  1379. transformers/models/univnet/feature_extraction_univnet.py +14 -14
  1380. transformers/models/univnet/modeling_univnet.py +7 -8
  1381. transformers/models/upernet/configuration_upernet.py +0 -1
  1382. transformers/models/upernet/modeling_upernet.py +10 -13
  1383. transformers/models/vaultgemma/__init__.py +0 -1
  1384. transformers/models/vaultgemma/configuration_vaultgemma.py +24 -26
  1385. transformers/models/vaultgemma/modeling_vaultgemma.py +35 -37
  1386. transformers/models/vaultgemma/modular_vaultgemma.py +29 -31
  1387. transformers/models/video_llama_3/image_processing_video_llama_3.py +43 -42
  1388. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +8 -8
  1389. transformers/models/video_llama_3/modeling_video_llama_3.py +77 -66
  1390. transformers/models/video_llama_3/modular_video_llama_3.py +110 -112
  1391. transformers/models/video_llama_3/processing_video_llama_3.py +5 -39
  1392. transformers/models/video_llama_3/video_processing_video_llama_3.py +18 -18
  1393. transformers/models/video_llava/configuration_video_llava.py +0 -1
  1394. transformers/models/video_llava/image_processing_video_llava.py +35 -38
  1395. transformers/models/video_llava/modeling_video_llava.py +59 -57
  1396. transformers/models/video_llava/processing_video_llava.py +38 -78
  1397. transformers/models/video_llava/video_processing_video_llava.py +0 -1
  1398. transformers/models/videomae/configuration_videomae.py +0 -1
  1399. transformers/models/videomae/image_processing_videomae.py +31 -34
  1400. transformers/models/videomae/modeling_videomae.py +13 -15
  1401. transformers/models/videomae/video_processing_videomae.py +0 -1
  1402. transformers/models/vilt/configuration_vilt.py +2 -3
  1403. transformers/models/vilt/image_processing_vilt.py +29 -30
  1404. transformers/models/vilt/image_processing_vilt_fast.py +9 -10
  1405. transformers/models/vilt/modeling_vilt.py +83 -78
  1406. transformers/models/vilt/processing_vilt.py +2 -14
  1407. transformers/models/vipllava/configuration_vipllava.py +0 -1
  1408. transformers/models/vipllava/modeling_vipllava.py +45 -42
  1409. transformers/models/vipllava/modular_vipllava.py +30 -32
  1410. transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +0 -1
  1411. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +18 -21
  1412. transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +0 -1
  1413. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +18 -21
  1414. transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +2 -16
  1415. transformers/models/visual_bert/configuration_visual_bert.py +0 -1
  1416. transformers/models/visual_bert/modeling_visual_bert.py +92 -92
  1417. transformers/models/vit/configuration_vit.py +0 -1
  1418. transformers/models/vit/image_processing_vit.py +19 -22
  1419. transformers/models/vit/image_processing_vit_fast.py +0 -1
  1420. transformers/models/vit/modeling_vit.py +13 -15
  1421. transformers/models/vit_mae/configuration_vit_mae.py +0 -1
  1422. transformers/models/vit_mae/modeling_vit_mae.py +21 -23
  1423. transformers/models/vit_msn/configuration_vit_msn.py +0 -1
  1424. transformers/models/vit_msn/modeling_vit_msn.py +10 -12
  1425. transformers/models/vitdet/configuration_vitdet.py +0 -1
  1426. transformers/models/vitdet/modeling_vitdet.py +12 -14
  1427. transformers/models/vitmatte/configuration_vitmatte.py +2 -5
  1428. transformers/models/vitmatte/image_processing_vitmatte.py +15 -18
  1429. transformers/models/vitmatte/image_processing_vitmatte_fast.py +14 -16
  1430. transformers/models/vitmatte/modeling_vitmatte.py +13 -11
  1431. transformers/models/vitpose/configuration_vitpose.py +4 -7
  1432. transformers/models/vitpose/image_processing_vitpose.py +24 -25
  1433. transformers/models/vitpose/image_processing_vitpose_fast.py +9 -11
  1434. transformers/models/vitpose/modeling_vitpose.py +10 -12
  1435. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +0 -1
  1436. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +8 -10
  1437. transformers/models/vits/configuration_vits.py +0 -1
  1438. transformers/models/vits/modeling_vits.py +34 -35
  1439. transformers/models/vits/tokenization_vits.py +3 -4
  1440. transformers/models/vivit/configuration_vivit.py +0 -1
  1441. transformers/models/vivit/image_processing_vivit.py +36 -39
  1442. transformers/models/vivit/modeling_vivit.py +5 -7
  1443. transformers/models/vjepa2/__init__.py +0 -1
  1444. transformers/models/vjepa2/configuration_vjepa2.py +0 -1
  1445. transformers/models/vjepa2/modeling_vjepa2.py +30 -32
  1446. transformers/models/vjepa2/video_processing_vjepa2.py +0 -1
  1447. transformers/models/voxtral/__init__.py +0 -1
  1448. transformers/models/voxtral/configuration_voxtral.py +0 -1
  1449. transformers/models/voxtral/modeling_voxtral.py +19 -27
  1450. transformers/models/voxtral/modular_voxtral.py +12 -21
  1451. transformers/models/voxtral/processing_voxtral.py +25 -48
  1452. transformers/models/wav2vec2/configuration_wav2vec2.py +0 -1
  1453. transformers/models/wav2vec2/feature_extraction_wav2vec2.py +7 -10
  1454. transformers/models/wav2vec2/modeling_wav2vec2.py +67 -122
  1455. transformers/models/wav2vec2/processing_wav2vec2.py +6 -35
  1456. transformers/models/wav2vec2/tokenization_wav2vec2.py +20 -332
  1457. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +0 -1
  1458. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +65 -62
  1459. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +52 -48
  1460. transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +6 -35
  1461. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +0 -1
  1462. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +84 -77
  1463. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +37 -30
  1464. transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +16 -17
  1465. transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +36 -55
  1466. transformers/models/wavlm/configuration_wavlm.py +0 -1
  1467. transformers/models/wavlm/modeling_wavlm.py +45 -48
  1468. transformers/models/wavlm/modular_wavlm.py +4 -5
  1469. transformers/models/whisper/configuration_whisper.py +0 -1
  1470. transformers/models/whisper/english_normalizer.py +3 -4
  1471. transformers/models/whisper/feature_extraction_whisper.py +9 -24
  1472. transformers/models/whisper/generation_whisper.py +27 -48
  1473. transformers/models/whisper/modeling_whisper.py +73 -73
  1474. transformers/models/whisper/processing_whisper.py +3 -20
  1475. transformers/models/whisper/tokenization_whisper.py +9 -30
  1476. transformers/models/x_clip/configuration_x_clip.py +0 -1
  1477. transformers/models/x_clip/modeling_x_clip.py +70 -69
  1478. transformers/models/x_clip/processing_x_clip.py +2 -14
  1479. transformers/models/xcodec/configuration_xcodec.py +4 -6
  1480. transformers/models/xcodec/modeling_xcodec.py +20 -17
  1481. transformers/models/xglm/configuration_xglm.py +0 -1
  1482. transformers/models/xglm/modeling_xglm.py +59 -55
  1483. transformers/models/xglm/tokenization_xglm.py +1 -4
  1484. transformers/models/xlm/configuration_xlm.py +0 -1
  1485. transformers/models/xlm/modeling_xlm.py +139 -144
  1486. transformers/models/xlm/tokenization_xlm.py +3 -5
  1487. transformers/models/xlm_roberta/configuration_xlm_roberta.py +0 -1
  1488. transformers/models/xlm_roberta/modeling_xlm_roberta.py +195 -194
  1489. transformers/models/xlm_roberta/modular_xlm_roberta.py +50 -53
  1490. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +1 -4
  1491. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +0 -1
  1492. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +94 -93
  1493. transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +67 -70
  1494. transformers/models/xlnet/configuration_xlnet.py +0 -11
  1495. transformers/models/xlnet/modeling_xlnet.py +152 -163
  1496. transformers/models/xlnet/tokenization_xlnet.py +1 -4
  1497. transformers/models/xlstm/configuration_xlstm.py +3 -5
  1498. transformers/models/xlstm/modeling_xlstm.py +62 -65
  1499. transformers/models/xmod/configuration_xmod.py +0 -1
  1500. transformers/models/xmod/modeling_xmod.py +101 -100
  1501. transformers/models/yolos/configuration_yolos.py +0 -1
  1502. transformers/models/yolos/image_processing_yolos.py +60 -62
  1503. transformers/models/yolos/image_processing_yolos_fast.py +18 -18
  1504. transformers/models/yolos/modeling_yolos.py +12 -14
  1505. transformers/models/yolos/modular_yolos.py +2 -4
  1506. transformers/models/yoso/configuration_yoso.py +0 -1
  1507. transformers/models/yoso/modeling_yoso.py +64 -63
  1508. transformers/models/zamba/configuration_zamba.py +0 -1
  1509. transformers/models/zamba/modeling_zamba.py +70 -70
  1510. transformers/models/zamba2/configuration_zamba2.py +36 -37
  1511. transformers/models/zamba2/modeling_zamba2.py +87 -89
  1512. transformers/models/zamba2/modular_zamba2.py +43 -45
  1513. transformers/models/zoedepth/configuration_zoedepth.py +1 -2
  1514. transformers/models/zoedepth/image_processing_zoedepth.py +28 -29
  1515. transformers/models/zoedepth/image_processing_zoedepth_fast.py +12 -15
  1516. transformers/models/zoedepth/modeling_zoedepth.py +21 -16
  1517. transformers/pipelines/__init__.py +59 -55
  1518. transformers/pipelines/any_to_any.py +14 -22
  1519. transformers/pipelines/audio_utils.py +1 -2
  1520. transformers/pipelines/automatic_speech_recognition.py +20 -12
  1521. transformers/pipelines/base.py +13 -17
  1522. transformers/pipelines/deprecated/__init__.py +0 -1
  1523. transformers/pipelines/document_question_answering.py +1 -1
  1524. transformers/pipelines/image_text_to_text.py +0 -1
  1525. transformers/pipelines/image_to_text.py +4 -44
  1526. transformers/pipelines/question_answering.py +5 -44
  1527. transformers/pipelines/text_classification.py +1 -14
  1528. transformers/pipelines/text_to_audio.py +2 -2
  1529. transformers/pipelines/token_classification.py +1 -22
  1530. transformers/pipelines/video_classification.py +1 -9
  1531. transformers/pipelines/zero_shot_audio_classification.py +0 -1
  1532. transformers/pipelines/zero_shot_classification.py +0 -6
  1533. transformers/pipelines/zero_shot_image_classification.py +0 -7
  1534. transformers/processing_utils.py +222 -151
  1535. transformers/quantizers/auto.py +2 -4
  1536. transformers/quantizers/base.py +19 -64
  1537. transformers/quantizers/quantizer_aqlm.py +1 -18
  1538. transformers/quantizers/quantizer_auto_round.py +1 -10
  1539. transformers/quantizers/quantizer_awq.py +3 -8
  1540. transformers/quantizers/quantizer_bitnet.py +1 -6
  1541. transformers/quantizers/quantizer_bnb_4bit.py +9 -49
  1542. transformers/quantizers/quantizer_bnb_8bit.py +9 -19
  1543. transformers/quantizers/quantizer_compressed_tensors.py +1 -4
  1544. transformers/quantizers/quantizer_eetq.py +2 -12
  1545. transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
  1546. transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
  1547. transformers/quantizers/quantizer_fp_quant.py +4 -4
  1548. transformers/quantizers/quantizer_gptq.py +1 -4
  1549. transformers/quantizers/quantizer_higgs.py +2 -6
  1550. transformers/quantizers/quantizer_mxfp4.py +2 -28
  1551. transformers/quantizers/quantizer_quanto.py +14 -14
  1552. transformers/quantizers/quantizer_quark.py +0 -1
  1553. transformers/quantizers/quantizer_spqr.py +3 -8
  1554. transformers/quantizers/quantizer_torchao.py +31 -127
  1555. transformers/quantizers/quantizer_vptq.py +1 -10
  1556. transformers/testing_utils.py +31 -49
  1557. transformers/tokenization_mistral_common.py +554 -902
  1558. transformers/tokenization_utils_base.py +112 -124
  1559. transformers/tokenization_utils_sentencepiece.py +5 -6
  1560. transformers/tokenization_utils_tokenizers.py +30 -7
  1561. transformers/trainer.py +30 -11
  1562. transformers/trainer_callback.py +8 -0
  1563. transformers/trainer_jit_checkpoint.py +1 -2
  1564. transformers/trainer_seq2seq.py +4 -0
  1565. transformers/training_args.py +11 -13
  1566. transformers/utils/__init__.py +4 -0
  1567. transformers/utils/attention_visualizer.py +5 -5
  1568. transformers/utils/auto_docstring.py +598 -37
  1569. transformers/utils/doc.py +1 -1
  1570. transformers/utils/dummy_pt_objects.py +0 -42
  1571. transformers/utils/generic.py +21 -1
  1572. transformers/utils/import_utils.py +51 -9
  1573. transformers/utils/kernel_config.py +71 -18
  1574. transformers/utils/loading_report.py +3 -3
  1575. transformers/utils/quantization_config.py +16 -18
  1576. transformers/video_processing_utils.py +35 -32
  1577. transformers/video_utils.py +18 -22
  1578. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc3.dist-info}/METADATA +23 -24
  1579. transformers-5.0.0rc3.dist-info/RECORD +2067 -0
  1580. transformers-5.0.0rc1.dist-info/RECORD +0 -2003
  1581. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc3.dist-info}/WHEEL +0 -0
  1582. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc3.dist-info}/entry_points.txt +0 -0
  1583. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc3.dist-info}/licenses/LICENSE +0 -0
  1584. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc3.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,3 @@
1
- # coding=utf-8
2
1
  # Copyright 2020 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
3
2
  # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
4
3
  #
@@ -19,12 +18,12 @@ import inspect
19
18
  import os
20
19
  import warnings
21
20
  from collections.abc import Callable
21
+ from contextlib import contextmanager
22
22
  from dataclasses import dataclass
23
- from typing import TYPE_CHECKING, Any, Optional, Union
23
+ from typing import TYPE_CHECKING, Any, Optional
24
24
 
25
25
  import torch
26
26
  import torch.distributed as dist
27
- from packaging import version
28
27
  from torch import nn
29
28
 
30
29
  from ..cache_utils import (
@@ -332,9 +331,9 @@ class GenerateBeamEncoderDecoderOutput(ModelOutput):
332
331
 
333
332
 
334
333
  # Typing shortcuts
335
- GenerateNonBeamOutput = Union[GenerateDecoderOnlyOutput, GenerateEncoderDecoderOutput]
336
- GenerateBeamOutput = Union[GenerateBeamDecoderOnlyOutput, GenerateBeamEncoderDecoderOutput]
337
- GenerateOutput = Union[GenerateNonBeamOutput, GenerateBeamOutput]
334
+ GenerateNonBeamOutput = GenerateDecoderOnlyOutput | GenerateEncoderDecoderOutput
335
+ GenerateBeamOutput = GenerateBeamDecoderOnlyOutput | GenerateBeamEncoderDecoderOutput
336
+ GenerateOutput = GenerateNonBeamOutput | GenerateBeamOutput
338
337
 
339
338
 
340
339
  class GenerationMixin(ContinuousMixin):
@@ -407,6 +406,9 @@ class GenerationMixin(ContinuousMixin):
407
406
  **repo_loading_kwargs,
408
407
  )
409
408
  except OSError:
409
+ # `self` already has a generation config created from model config, but model config will
410
+ # not contain any generation-specific params. These are popped at config's `__init__`.
411
+ # Thus we have to load from `config.json` and create a generation config from it (for BART)
410
412
  logger.info(
411
413
  "Generation config file not found, using a generation config created from the model config."
412
414
  )
@@ -418,6 +420,7 @@ class GenerationMixin(ContinuousMixin):
418
420
  _from_model_config=True,
419
421
  **repo_loading_kwargs,
420
422
  )
423
+
421
424
  # Load custom generate function if `pretrained_model_name_or_path` defines it (and override `generate`)
422
425
  if hasattr(self, "load_custom_generate") and trust_remote_code:
423
426
  try:
@@ -593,6 +596,7 @@ class GenerationMixin(ContinuousMixin):
593
596
  attention_mask: torch.LongTensor | None = None,
594
597
  inputs_embeds: torch.FloatTensor | None = None,
595
598
  cache_position: torch.LongTensor | None = None,
599
+ is_first_iteration: bool | None = False,
596
600
  **kwargs,
597
601
  ):
598
602
  """
@@ -628,7 +632,7 @@ class GenerationMixin(ContinuousMixin):
628
632
  input_ids_key = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
629
633
  # if `inputs_embeds` are passed, we only want to use them in the 1st generation step for every prompt.
630
634
  if not self.config.is_encoder_decoder:
631
- if inputs_embeds is not None and len(cache_position) == inputs_embeds.shape[1]:
635
+ if inputs_embeds is not None and is_first_iteration:
632
636
  model_inputs[input_ids_key] = None
633
637
  model_inputs["inputs_embeds"] = inputs_embeds
634
638
  else:
@@ -708,6 +712,7 @@ class GenerationMixin(ContinuousMixin):
708
712
  past_key_values=past_key_values,
709
713
  position_ids=position_ids,
710
714
  token_type_ids=token_type_ids,
715
+ is_first_iteration=is_first_iteration,
711
716
  )
712
717
  else:
713
718
  attention_mask = causal_mask_creation_function(
@@ -1300,7 +1305,7 @@ class GenerationMixin(ContinuousMixin):
1300
1305
  if generation_config.do_sample:
1301
1306
  # In beam methods, we need to keep at least one non-eos token to explore continuations that might have a
1302
1307
  # better score (i.e. keep len(list(generation_config._eos_token_tensor)) + 1)
1303
- if generation_config.num_beams > 1:
1308
+ if generation_config.num_beams is not None and generation_config.num_beams > 1:
1304
1309
  if isinstance(generation_config._eos_token_tensor, list):
1305
1310
  min_tokens_to_keep = len(generation_config._eos_token_tensor) + 1
1306
1311
  elif isinstance(generation_config._eos_token_tensor, torch.Tensor):
@@ -1722,8 +1727,8 @@ class GenerationMixin(ContinuousMixin):
1722
1727
  )
1723
1728
  generation_config.max_length = generation_config.max_new_tokens + input_ids_length
1724
1729
 
1725
- # if both `inputs_embeds` and `input_ids` are passed, we do not correct the length
1726
- # otherwise we need total length [inputs-embeds-len + new-tokens-len] to not go beyond indicated `max_length``
1730
+ # If both `inputs_embeds` and `input_ids` are passed, we correct length with `inputs_tensor.shape`
1731
+ # We need to get max_length = inputs_embeds_len + max_new_tokens
1727
1732
  elif (
1728
1733
  model_input_name == "inputs_embeds"
1729
1734
  and input_ids_length != inputs_tensor.shape[1]
@@ -1731,11 +1736,10 @@ class GenerationMixin(ContinuousMixin):
1731
1736
  ):
1732
1737
  generation_config.max_length -= inputs_tensor.shape[1]
1733
1738
  elif has_default_max_length: # by default let's always generate 20 new tokens
1734
- if generation_config.max_length == GenerationConfig().max_length:
1735
- generation_config.max_length = generation_config.max_length + input_ids_length
1736
- max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
1737
- if max_position_embeddings is not None:
1738
- generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
1739
+ generation_config.max_length = generation_config.max_length + input_ids_length
1740
+ max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
1741
+ if max_position_embeddings is not None:
1742
+ generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
1739
1743
 
1740
1744
  # same for min length
1741
1745
  if generation_config.min_new_tokens is not None:
@@ -1760,7 +1764,6 @@ class GenerationMixin(ContinuousMixin):
1760
1764
  def _prepare_generation_config(
1761
1765
  self,
1762
1766
  generation_config: GenerationConfig | None,
1763
- use_model_defaults: bool | None = None,
1764
1767
  **kwargs: Any,
1765
1768
  ) -> tuple[GenerationConfig, dict]:
1766
1769
  """
@@ -1768,93 +1771,46 @@ class GenerationMixin(ContinuousMixin):
1768
1771
  function handles retrocompatibility with respect to configuration files.
1769
1772
  """
1770
1773
  # parameterization priority:
1771
- # kwargs > non-global default values in `generation_config` > `model.generation_config` > GenerationConfig()
1774
+ # user-defined kwargs or `generation_config` > `self.generation_config` > global default values
1775
+ # TODO: (raushan) doesn't make sense to allow kwargs and `generation_config`. Should be mutually exclusive!
1772
1776
  # TODO (joao): per-model generation config classes.
1773
1777
 
1774
- using_model_generation_config = False
1775
1778
  if generation_config is None:
1776
- # legacy: users may modify the model configuration to control generation. To trigger this legacy behavior,
1777
- # the following conditions must be met
1778
- # 1) the generation config must have been created from the model config (`_from_model_config` field);
1779
- # 2) the generation config must have seen no modification since its creation (the hash is the same);
1780
- # 3) there are non-default generation parameters in the model config.
1781
- # 4) the user must have set new generation parameters in the model config.
1782
- if (
1783
- self.generation_config._from_model_config # 1)
1784
- and self.generation_config._original_object_hash == hash(self.generation_config) # 2)
1785
- and len(self.config._get_non_default_generation_parameters()) > 0 # 3)
1786
- ):
1787
- new_generation_config = GenerationConfig.from_model_config(self.config)
1788
- if new_generation_config != self.generation_config: # 4)
1789
- raise ValueError(
1790
- "You have modified the pretrained model configuration to control generation."
1791
- " This strategy to control generation is not supported anymore. "
1792
- " Please use and modify the model generation configuration (see"
1793
- " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )",
1794
- )
1795
-
1796
- generation_config = self.generation_config
1797
- using_model_generation_config = True
1798
-
1799
- # Related to #40039: prior to this PR, models with sliding window attention were forced to have
1800
- # `cache_implementation="hybrid"` (the static sliding window cache). For these models, we now want to use
1801
- # the dynamic sliding window cache by default, so we UNSET `cache_implementation` if it is a default value.
1802
- # (if we're inside this branch, then it is because we're using default values from the Hub)
1803
- if generation_config.cache_implementation == "hybrid":
1804
- generation_config.cache_implementation = None
1779
+ # Users may modify `model.config` to control generation. This is a legacy behavior and is not supported anymore
1780
+ if len(self.config._get_generation_parameters()) > 0:
1781
+ raise ValueError(
1782
+ "You have modified the pretrained model configuration to control generation "
1783
+ f"We detected the following values set - {self.config._get_generation_parameters()}. "
1784
+ "This strategy to control generation is not supported anymore. Please use and modify `model.generation_config` "
1785
+ "(see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )",
1786
+ )
1787
+ generation_config = GenerationConfig()
1805
1788
 
1806
1789
  # `torch.export.export` usually raises an exception if it is called
1807
1790
  # with ``strict=True``. deepcopy can only be processed if ``strict=False``.
1808
1791
  generation_config = copy.deepcopy(generation_config)
1809
1792
 
1810
- if not using_model_generation_config:
1811
- # If `generation_config` is provided:
1812
- # - `use_model_defaults`: let's fallback ALL default values to the model's generation config
1813
- # - otherwise: legacy behavior, let's just make sure we have the tokens defined
1814
- model_base_version = version.parse(version.parse(self.generation_config.transformers_version).base_version)
1815
- if use_model_defaults is True or (
1816
- use_model_defaults is None and model_base_version >= version.parse("4.50.0")
1817
- ):
1818
- modified_values = {}
1819
- global_default_generation_config = GenerationConfig()
1820
- model_generation_config = self.generation_config
1821
- # we iterate over the model's generation config: it may hold custom keys, which we'll want to copy
1822
- for key, model_gen_config_value in model_generation_config.__dict__.items():
1823
- if key.startswith("_") or key == "transformers_version": # metadata
1824
- continue
1825
- # Don't set `cache_implementation = 'hybrid'` from the model defaults, see #40135
1826
- if key == "cache_implementation" and model_generation_config.cache_implementation == "hybrid":
1827
- continue
1828
- global_default_value = getattr(global_default_generation_config, key, None)
1829
- custom_gen_config_value = getattr(generation_config, key, None)
1830
- if (
1831
- custom_gen_config_value == global_default_value
1832
- and model_gen_config_value != global_default_value
1833
- ):
1834
- modified_values[key] = model_gen_config_value
1835
- setattr(generation_config, key, model_gen_config_value)
1836
- # edge case: we may set `temperature=0.0` and `do_sample=False`, but the model defaults to
1837
- # `do_sample=True`
1838
- if generation_config.temperature == 0.0:
1839
- generation_config.do_sample = False
1840
- if use_model_defaults is None and len(modified_values) > 0:
1841
- logger.warning_once(
1842
- f"`generation_config` default values have been modified to match model-specific defaults: "
1843
- f"{modified_values}. If this is not desired, please set these values explicitly."
1844
- )
1845
- else:
1846
- if generation_config.bos_token_id is None:
1847
- generation_config.bos_token_id = self.generation_config.bos_token_id
1848
- if generation_config.eos_token_id is None:
1849
- generation_config.eos_token_id = self.generation_config.eos_token_id
1850
- if generation_config.pad_token_id is None:
1851
- generation_config.pad_token_id = self.generation_config.pad_token_id
1852
- if generation_config.decoder_start_token_id is None:
1853
- generation_config.decoder_start_token_id = self.generation_config.decoder_start_token_id
1854
-
1855
- # Finally, apply any passed kwargs
1793
+ # First set values from the loaded `self.generation_config`, then set default values (BC)
1794
+ #
1795
+ # Only update values that are `None`, i.e. these values were not explicitly set by users to `generate()`,
1796
+ # or values that are not present in the current config, i.e. custom entries that were set via `**kwargs`.
1797
+ # Thus we use the specific kwargs `defaults_only=True` (`None` values only) and `allow_custom_entries=True`
1798
+ # (custom entries are carried over).
1799
+ global_defaults = self.generation_config._get_default_generation_params()
1800
+ generation_config.update(**self.generation_config.to_dict(), defaults_only=True, allow_custom_entries=True)
1801
+ generation_config.update(**global_defaults, defaults_only=True)
1802
+
1803
+ # Finally, if there are any kwargs, update config with it -> highest priority at the end
1856
1804
  model_kwargs = generation_config.update(**kwargs)
1857
- # And keep in model_kwargs variable output controls
1805
+
1806
+ # Related to #40039: prior to this PR, models with sliding window attention were forced to have
1807
+ # `cache_implementation="hybrid"` (the static sliding window cache). For these models, we now want to use
1808
+ # the dynamic sliding window cache by default, so we UNSET `cache_implementation` if it is a default value.
1809
+ # (if we're inside this branch, then it is because we're using default values from the Hub)
1810
+ if generation_config.cache_implementation == "hybrid":
1811
+ generation_config.cache_implementation = None
1812
+
1813
+ # Finally keep output_xxx args in `model_kwargs` so it can be passed to `forward`
1858
1814
  output_attentions = generation_config.output_attentions
1859
1815
  output_hidden_states = generation_config.output_hidden_states
1860
1816
  model_kwargs.update({"output_attentions": output_attentions} if output_attentions else {})
@@ -1946,6 +1902,7 @@ class GenerationMixin(ContinuousMixin):
1946
1902
  # NOTE: remove xlnet/reformer when the models are deprecated, non-standard model architecture/cache name
1947
1903
  return not cls._is_stateful and all(
1948
1904
  special_model_name not in cls.__name__.lower()
1905
+ or "minimaxm2" in cls.__name__.lower() # name clash between minimax and minimax m2
1949
1906
  for special_model_name in [
1950
1907
  "reformer",
1951
1908
  "minimax",
@@ -2211,8 +2168,10 @@ class GenerationMixin(ContinuousMixin):
2211
2168
  "will be skipped."
2212
2169
  )
2213
2170
 
2214
- # Finally: if we can compile, disable tokenizers parallelism and check for FA2 + static cache
2171
+ if can_compile:
2172
+ # Finally: if we can compile, disable tokenizers parallelism
2215
2173
  os.environ["TOKENIZERS_PARALLELISM"] = "0"
2174
+
2216
2175
  # If we use FA2 and a static cache, we cannot compile with fullgraph
2217
2176
  if self.config._attn_implementation == "flash_attention_2":
2218
2177
  # only raise warning if the user passed an explicit compile-config
@@ -2225,6 +2184,22 @@ class GenerationMixin(ContinuousMixin):
2225
2184
 
2226
2185
  return can_compile
2227
2186
 
2187
+ @contextmanager
2188
+ def _optimize_model_for_decode(self):
2189
+ original_experts_implementation = self.config._experts_implementation
2190
+ if original_experts_implementation == "grouped_mm":
2191
+ logger.info_once(
2192
+ "We will be switching to 'batched_mm' for the decoding stage as it is much more performant than 'grouped_mm' on smaller inputs. "
2193
+ "If you experience any issues with this, please open an issue on the Hugging Face Transformers GitHub repository.",
2194
+ )
2195
+ self.set_experts_implementation("batched_mm")
2196
+
2197
+ try:
2198
+ yield
2199
+ finally:
2200
+ if original_experts_implementation == "grouped_mm":
2201
+ self.set_experts_implementation(original_experts_implementation)
2202
+
2228
2203
  def _get_deprecated_gen_repo(
2229
2204
  self,
2230
2205
  generation_mode: GenerationMode,
@@ -2294,7 +2269,6 @@ class GenerationMixin(ContinuousMixin):
2294
2269
  streamer: Optional["BaseStreamer"] = None,
2295
2270
  negative_prompt_ids: torch.Tensor | None = None,
2296
2271
  negative_prompt_attention_mask: torch.Tensor | None = None,
2297
- use_model_defaults: bool | None = None,
2298
2272
  custom_generate: str | Callable | None = None,
2299
2273
  **kwargs,
2300
2274
  ) -> GenerateOutput | torch.LongTensor:
@@ -2360,11 +2334,6 @@ class GenerationMixin(ContinuousMixin):
2360
2334
  size. This is an experimental feature, subject to breaking API changes in future versions.
2361
2335
  negative_prompt_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
2362
2336
  Attention_mask for `negative_prompt_ids`.
2363
- use_model_defaults (`bool`, *optional*):
2364
- When it is `True`, unset parameters in `generation_config` will be set to the model-specific default
2365
- generation configuration (`model.generation_config`), as opposed to the global defaults
2366
- (`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be
2367
- `True`.
2368
2337
  custom_generate (`str` or `Callable`, *optional*):
2369
2338
  One of the following:
2370
2339
  - `str` (Hugging Face Hub repository name): runs the custom `generate` function defined at
@@ -2449,7 +2418,7 @@ class GenerationMixin(ContinuousMixin):
2449
2418
  raise NotImplementedError(
2450
2419
  f"assistant_model is not supported for continuous batching. Got {assistant_model = }"
2451
2420
  )
2452
- if streamer is not None: # TODO: actualy this could be supported
2421
+ if streamer is not None: # TODO: actually this could be supported
2453
2422
  raise NotImplementedError(f"streaming is not supported for continuous batching. Got {streamer = }")
2454
2423
  if negative_prompt_ids is not None:
2455
2424
  raise NotImplementedError(
@@ -2474,7 +2443,7 @@ class GenerationMixin(ContinuousMixin):
2474
2443
  # switch to CB
2475
2444
  outputs = self.generate_batch(
2476
2445
  inputs=inputs,
2477
- generation_config=self._prepare_generation_config(generation_config, use_model_defaults, **kwargs)[0],
2446
+ generation_config=self._prepare_generation_config(generation_config, **kwargs)[0],
2478
2447
  **kwargs,
2479
2448
  )
2480
2449
  sequences = [
@@ -2495,13 +2464,21 @@ class GenerationMixin(ContinuousMixin):
2495
2464
  streamer,
2496
2465
  )
2497
2466
 
2498
- generation_config, model_kwargs = self._prepare_generation_config(
2499
- generation_config, use_model_defaults, **kwargs
2467
+ # Check length values before updating the config with defaults. We'll use it later to define the final min/max length (# 6)
2468
+ has_default_max_length = kwargs.get("max_length") is None and (
2469
+ generation_config is None or generation_config.max_length is None
2500
2470
  )
2471
+ has_default_min_length = kwargs.get("min_length") is None and (
2472
+ generation_config is None or generation_config.min_length is None
2473
+ )
2474
+ generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
2475
+
2501
2476
  generation_mode = generation_config.get_generation_mode(assistant_model)
2477
+ deprecated_mode_repo = self._get_deprecated_gen_repo(generation_mode, trust_remote_code, custom_generate)
2478
+
2502
2479
  if isinstance(custom_generate, Callable):
2503
2480
  decoding_method = custom_generate
2504
- else:
2481
+ elif deprecated_mode_repo is None:
2505
2482
  # type() required to access the unbound class-level method
2506
2483
  decoding_method = getattr(type(self), GENERATION_MODES_MAPPING[generation_mode])
2507
2484
 
@@ -2512,7 +2489,7 @@ class GenerationMixin(ContinuousMixin):
2512
2489
  # NOTE: This must come after initializing generation_config, since we need it to determine if this is a deprecated mode.
2513
2490
  # It must also be before any preparation steps, since Hub repos expect to be loaded before preparation steps.
2514
2491
  # TODO joao, manuel: remove this in v4.62.0
2515
- if deprecated_mode_repo := self._get_deprecated_gen_repo(generation_mode, trust_remote_code, custom_generate):
2492
+ if deprecated_mode_repo is not None:
2516
2493
  return GenerationMixin.generate(
2517
2494
  self,
2518
2495
  inputs=inputs,
@@ -2523,7 +2500,6 @@ class GenerationMixin(ContinuousMixin):
2523
2500
  assistant_model=assistant_model,
2524
2501
  negative_prompt_ids=negative_prompt_ids,
2525
2502
  negative_prompt_attention_mask=negative_prompt_attention_mask,
2526
- use_model_defaults=use_model_defaults,
2527
2503
  custom_generate=deprecated_mode_repo,
2528
2504
  trust_remote_code=trust_remote_code,
2529
2505
  **generation_mode_kwargs,
@@ -2614,8 +2590,6 @@ class GenerationMixin(ContinuousMixin):
2614
2590
 
2615
2591
  # 6. Prepare `max_length` depending on other stopping criteria.
2616
2592
  input_ids_length = input_ids.shape[1]
2617
- has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
2618
- has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None
2619
2593
  generation_config = self._prepare_generated_length(
2620
2594
  generation_config=generation_config,
2621
2595
  has_default_max_length=has_default_max_length,
@@ -2873,13 +2847,20 @@ class GenerationMixin(ContinuousMixin):
2873
2847
  else self.__call__
2874
2848
  )
2875
2849
 
2876
- prefill_consumed = False
2877
- outputs = self._prefill(input_ids, generation_config, model_kwargs)
2850
+ # Assisted generation completes the prefill stage in candidate generator so that
2851
+ # we don't have several `prefill` calls in one generation loop. Skip `_prefill` for assistants
2852
+ if not generation_config.is_assistant:
2853
+ outputs = self._prefill(input_ids, generation_config, model_kwargs)
2854
+ prefill_consumed = False
2855
+ else:
2856
+ model_kwargs = self._get_initial_cache_position(input_ids.shape[1], input_ids.device, model_kwargs)
2857
+ prefill_consumed = True
2878
2858
 
2879
2859
  while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
2880
2860
  if prefill_consumed:
2881
2861
  model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
2882
- outputs = model_forward(**model_inputs, return_dict=True)
2862
+ with self._optimize_model_for_decode():
2863
+ outputs = model_forward(**model_inputs, return_dict=True)
2883
2864
  prefill_consumed = True
2884
2865
  model_kwargs = self._update_model_kwargs_for_generation(
2885
2866
  outputs,
@@ -3351,9 +3332,15 @@ class GenerationMixin(ContinuousMixin):
3351
3332
  )
3352
3333
  beam_indices = running_beam_indices.detach().clone()
3353
3334
 
3354
- prefill_consumed = False
3355
3335
  flat_running_sequences = input_ids
3356
- model_outputs = self._prefill(input_ids, generation_config, model_kwargs)
3336
+ # Assisted generation completes the prefill stage in candidate generator so that
3337
+ # we don't have several `prefill` calls in one generation loop. Skip `_prefill` for assistants
3338
+ if not generation_config.is_assistant:
3339
+ model_outputs = self._prefill(input_ids, generation_config, model_kwargs)
3340
+ prefill_consumed = False
3341
+ else:
3342
+ model_kwargs = self._get_initial_cache_position(input_ids.shape[1], input_ids.device, model_kwargs)
3343
+ prefill_consumed = True
3357
3344
 
3358
3345
  # 4. run the generation loop
3359
3346
  while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
@@ -3659,7 +3646,7 @@ class GenerationMixin(ContinuousMixin):
3659
3646
  cur_len = input_ids.shape[1]
3660
3647
 
3661
3648
  # 1. Fetch candidate sequences from a `CandidateGenerator` and move to the correct device
3662
- candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids)
3649
+ candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids, is_first_iteration)
3663
3650
  candidate_input_ids = candidate_input_ids.to(self.device)
3664
3651
  if candidate_logits is not None:
3665
3652
  candidate_logits = candidate_logits.to(self.device)
@@ -3686,7 +3673,9 @@ class GenerationMixin(ContinuousMixin):
3686
3673
  dim=0,
3687
3674
  )
3688
3675
 
3689
- model_inputs = self.prepare_inputs_for_generation(candidate_input_ids, **candidate_kwargs)
3676
+ model_inputs = self.prepare_inputs_for_generation(
3677
+ candidate_input_ids, is_first_iteration=is_first_iteration, **candidate_kwargs
3678
+ )
3690
3679
  if "logits_to_keep" in model_inputs:
3691
3680
  model_inputs["logits_to_keep"] = candidate_length + 1
3692
3681
 
@@ -3849,7 +3838,7 @@ class GenerationMixin(ContinuousMixin):
3849
3838
  def _prefill(self, input_ids: torch.LongTensor, generation_config: GenerationConfig, model_kwargs):
3850
3839
  if generation_config.prefill_chunk_size is None:
3851
3840
  model_kwargs = self._get_initial_cache_position(input_ids.shape[1], input_ids.device, model_kwargs)
3852
- model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
3841
+ model_inputs = self.prepare_inputs_for_generation(input_ids, is_first_iteration=True, **model_kwargs)
3853
3842
  return self(**model_inputs, return_dict=True)
3854
3843
  else: # Chunked prefill
3855
3844
  # Even if we are not compiling the forward, flex is always compiled when used. With chunked prefill, we may
@@ -3877,7 +3866,6 @@ class GenerationMixin(ContinuousMixin):
3877
3866
  model_kwargs["cache_position"] = torch.arange(
3878
3867
  past_length, current_length, dtype=torch.long, device=input_chunk.device
3879
3868
  )
3880
- model_kwargs["position_ids"] = model_kwargs["cache_position"].unsqueeze(0)
3881
3869
  model_inputs = self.prepare_inputs_for_generation(input_chunk, **model_kwargs)
3882
3870
 
3883
3871
  outputs = model_forward(**model_inputs, return_dict=True)
@@ -1,4 +1,3 @@
1
- # coding=utf-8
2
1
  # Copyright 2024 The HuggingFace Inc. team and Google DeepMind.
3
2
  #
4
3
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,7 +15,7 @@
16
15
  import collections
17
16
  from dataclasses import dataclass
18
17
  from functools import lru_cache
19
- from typing import Any
18
+ from typing import TYPE_CHECKING, Any, Union
20
19
 
21
20
  import numpy as np
22
21
  import torch
@@ -24,12 +23,15 @@ from torch import nn
24
23
  from torch.nn import BCELoss
25
24
 
26
25
  from .. import initialization as init
26
+ from ..configuration_utils import PreTrainedConfig
27
27
  from ..modeling_utils import PreTrainedModel
28
28
  from ..utils import ModelOutput, logging
29
- from .configuration_utils import PreTrainedConfig, WatermarkingConfig
30
29
  from .logits_process import SynthIDTextWatermarkLogitsProcessor, WatermarkLogitsProcessor
31
30
 
32
31
 
32
+ if TYPE_CHECKING:
33
+ from .configuration_utils import WatermarkingConfig
34
+
33
35
  logger = logging.get_logger(__name__)
34
36
 
35
37
 
@@ -120,13 +122,13 @@ class WatermarkDetector:
120
122
 
121
123
  def __init__(
122
124
  self,
123
- model_config: PreTrainedConfig,
125
+ model_config: "PreTrainedConfig",
124
126
  device: str,
125
- watermarking_config: WatermarkingConfig | dict,
127
+ watermarking_config: Union["WatermarkingConfig", dict] | None,
126
128
  ignore_repeated_ngrams: bool = False,
127
129
  max_cache_size: int = 128,
128
130
  ):
129
- if isinstance(watermarking_config, WatermarkingConfig):
131
+ if not isinstance(watermarking_config, dict):
130
132
  watermarking_config = watermarking_config.to_dict()
131
133
 
132
134
  self.bos_token_id = (
@@ -23,7 +23,7 @@ from copy import copy
23
23
  from enum import Enum
24
24
  from inspect import isclass
25
25
  from pathlib import Path
26
- from typing import Any, Literal, NewType, Optional, Union, get_type_hints
26
+ from typing import Any, Literal, NewType, Union, get_type_hints
27
27
 
28
28
  import yaml
29
29
 
@@ -63,11 +63,11 @@ def make_choice_type_function(choices: list) -> Callable[[str], Any]:
63
63
 
64
64
  def HfArg(
65
65
  *,
66
- aliases: Optional[Union[str, list[str]]] = None,
67
- help: Optional[str] = None,
66
+ aliases: str | list[str] | None = None,
67
+ help: str | None = None,
68
68
  default: Any = dataclasses.MISSING,
69
69
  default_factory: Callable[[], Any] = dataclasses.MISSING,
70
- metadata: Optional[dict] = None,
70
+ metadata: dict | None = None,
71
71
  **kwargs,
72
72
  ) -> dataclasses.Field:
73
73
  """Argument helper enabling a concise syntax to create dataclass fields for parsing with `HfArgumentParser`.
@@ -125,7 +125,7 @@ class HfArgumentParser(ArgumentParser):
125
125
 
126
126
  dataclass_types: Iterable[DataClassType]
127
127
 
128
- def __init__(self, dataclass_types: Optional[Union[DataClassType, Iterable[DataClassType]]] = None, **kwargs):
128
+ def __init__(self, dataclass_types: DataClassType | Iterable[DataClassType] | None = None, **kwargs):
129
129
  # Make sure dataclass_types is an iterable
130
130
  if dataclass_types is None:
131
131
  dataclass_types = []
@@ -201,7 +201,7 @@ class HfArgumentParser(ArgumentParser):
201
201
  kwargs["default"] = field.default
202
202
  else:
203
203
  kwargs["required"] = True
204
- elif field.type is bool or field.type == Optional[bool]:
204
+ elif field.type is bool or field.type == bool | None:
205
205
  # Copy the correct kwargs to use to instantiate a `no_*` complement argument below.
206
206
  # We do not initialize it here because the `no_*` alternative must be instantiated after the real argument
207
207
  bool_kwargs = copy(kwargs)
@@ -238,7 +238,7 @@ class HfArgumentParser(ArgumentParser):
238
238
  # Order is important for arguments with the same destination!
239
239
  # We use a copy of earlier kwargs because the original kwargs have changed a lot before reaching down
240
240
  # here and we do not need those changes/additional keys.
241
- if field.default is True and (field.type is bool or field.type == Optional[bool]):
241
+ if field.default is True and (field.type is bool or field.type == bool | None):
242
242
  bool_kwargs["default"] = False
243
243
  parser.add_argument(
244
244
  f"--no_{field.name}",
@@ -383,9 +383,7 @@ class HfArgumentParser(ArgumentParser):
383
383
  raise ValueError(f"Some keys are not used by the HfArgumentParser: {sorted(unused_keys)}")
384
384
  return tuple(outputs)
385
385
 
386
- def parse_json_file(
387
- self, json_file: Union[str, os.PathLike], allow_extra_keys: bool = False
388
- ) -> tuple[DataClass, ...]:
386
+ def parse_json_file(self, json_file: str | os.PathLike, allow_extra_keys: bool = False) -> tuple[DataClass, ...]:
389
387
  """
390
388
  Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the
391
389
  dataclass types.
@@ -407,9 +405,7 @@ class HfArgumentParser(ArgumentParser):
407
405
  outputs = self.parse_dict(data, allow_extra_keys=allow_extra_keys)
408
406
  return tuple(outputs)
409
407
 
410
- def parse_yaml_file(
411
- self, yaml_file: Union[str, os.PathLike], allow_extra_keys: bool = False
412
- ) -> tuple[DataClass, ...]:
408
+ def parse_yaml_file(self, yaml_file: str | os.PathLike, allow_extra_keys: bool = False) -> tuple[DataClass, ...]:
413
409
  """
414
410
  Alternative helper method that does not use `argparse` at all, instead loading a yaml file and populating the
415
411
  dataclass types.
@@ -11,7 +11,6 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
- from typing import Optional
15
14
 
16
15
  from .integrations import (
17
16
  is_optuna_available,
@@ -35,7 +34,7 @@ logger = logging.get_logger(__name__)
35
34
 
36
35
  class HyperParamSearchBackendBase:
37
36
  name: str
38
- pip_package: Optional[str] = None
37
+ pip_package: str | None = None
39
38
 
40
39
  @staticmethod
41
40
  def is_available():