transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1584) hide show
  1. transformers/__init__.py +27 -27
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +32 -33
  4. transformers/cache_utils.py +32 -139
  5. transformers/cli/chat.py +3 -3
  6. transformers/cli/serve.py +2 -2
  7. transformers/cli/transformers.py +2 -1
  8. transformers/configuration_utils.py +143 -101
  9. transformers/conversion_mapping.py +73 -6
  10. transformers/convert_slow_tokenizer.py +3 -8
  11. transformers/core_model_loading.py +215 -50
  12. transformers/data/processors/glue.py +0 -1
  13. transformers/data/processors/utils.py +0 -1
  14. transformers/data/processors/xnli.py +0 -1
  15. transformers/dependency_versions_table.py +5 -5
  16. transformers/distributed/configuration_utils.py +1 -2
  17. transformers/dynamic_module_utils.py +23 -23
  18. transformers/feature_extraction_sequence_utils.py +19 -23
  19. transformers/feature_extraction_utils.py +63 -31
  20. transformers/generation/candidate_generator.py +80 -33
  21. transformers/generation/configuration_utils.py +186 -131
  22. transformers/generation/continuous_batching/__init__.py +0 -1
  23. transformers/generation/continuous_batching/cache.py +81 -24
  24. transformers/generation/continuous_batching/cache_manager.py +155 -45
  25. transformers/generation/continuous_batching/continuous_api.py +152 -84
  26. transformers/generation/continuous_batching/requests.py +51 -3
  27. transformers/generation/continuous_batching/scheduler.py +127 -52
  28. transformers/generation/logits_process.py +0 -128
  29. transformers/generation/stopping_criteria.py +1 -1
  30. transformers/generation/streamers.py +0 -1
  31. transformers/generation/utils.py +107 -119
  32. transformers/generation/watermarking.py +8 -6
  33. transformers/hf_argparser.py +9 -13
  34. transformers/hyperparameter_search.py +1 -2
  35. transformers/image_processing_base.py +11 -21
  36. transformers/image_processing_utils.py +11 -12
  37. transformers/image_processing_utils_fast.py +68 -57
  38. transformers/image_transforms.py +29 -29
  39. transformers/image_utils.py +30 -32
  40. transformers/initialization.py +37 -0
  41. transformers/integrations/__init__.py +12 -0
  42. transformers/integrations/accelerate.py +44 -111
  43. transformers/integrations/aqlm.py +3 -5
  44. transformers/integrations/awq.py +3 -8
  45. transformers/integrations/bitnet.py +5 -8
  46. transformers/integrations/bitsandbytes.py +16 -15
  47. transformers/integrations/deepspeed.py +19 -4
  48. transformers/integrations/eetq.py +3 -6
  49. transformers/integrations/fbgemm_fp8.py +2 -3
  50. transformers/integrations/finegrained_fp8.py +14 -23
  51. transformers/integrations/flash_attention.py +2 -2
  52. transformers/integrations/flex_attention.py +1 -1
  53. transformers/integrations/fp_quant.py +4 -6
  54. transformers/integrations/ggml.py +0 -1
  55. transformers/integrations/higgs.py +2 -5
  56. transformers/integrations/hub_kernels.py +23 -5
  57. transformers/integrations/integration_utils.py +37 -3
  58. transformers/integrations/mistral.py +12 -0
  59. transformers/integrations/moe.py +240 -0
  60. transformers/integrations/mxfp4.py +9 -16
  61. transformers/integrations/peft.py +5 -0
  62. transformers/integrations/quanto.py +5 -2
  63. transformers/integrations/quark.py +2 -4
  64. transformers/integrations/spqr.py +3 -5
  65. transformers/integrations/tensor_parallel.py +167 -221
  66. transformers/integrations/torchao.py +4 -6
  67. transformers/integrations/vptq.py +3 -5
  68. transformers/loss/loss_lw_detr.py +356 -0
  69. transformers/loss/loss_utils.py +2 -0
  70. transformers/masking_utils.py +47 -51
  71. transformers/model_debugging_utils.py +4 -5
  72. transformers/modelcard.py +14 -192
  73. transformers/modeling_attn_mask_utils.py +19 -19
  74. transformers/modeling_flash_attention_utils.py +27 -27
  75. transformers/modeling_gguf_pytorch_utils.py +71 -24
  76. transformers/modeling_layers.py +21 -22
  77. transformers/modeling_outputs.py +242 -253
  78. transformers/modeling_rope_utils.py +110 -113
  79. transformers/modeling_utils.py +633 -576
  80. transformers/models/__init__.py +23 -0
  81. transformers/models/afmoe/configuration_afmoe.py +26 -29
  82. transformers/models/afmoe/modeling_afmoe.py +37 -49
  83. transformers/models/afmoe/modular_afmoe.py +21 -31
  84. transformers/models/aimv2/configuration_aimv2.py +2 -5
  85. transformers/models/aimv2/modeling_aimv2.py +24 -21
  86. transformers/models/aimv2/modular_aimv2.py +11 -9
  87. transformers/models/albert/configuration_albert.py +0 -1
  88. transformers/models/albert/modeling_albert.py +70 -69
  89. transformers/models/albert/tokenization_albert.py +1 -4
  90. transformers/models/align/configuration_align.py +0 -1
  91. transformers/models/align/modeling_align.py +73 -68
  92. transformers/models/align/processing_align.py +2 -30
  93. transformers/models/altclip/configuration_altclip.py +0 -1
  94. transformers/models/altclip/modeling_altclip.py +83 -80
  95. transformers/models/altclip/processing_altclip.py +2 -15
  96. transformers/models/apertus/__init__.py +0 -1
  97. transformers/models/apertus/configuration_apertus.py +18 -21
  98. transformers/models/apertus/modeling_apertus.py +35 -36
  99. transformers/models/apertus/modular_apertus.py +32 -31
  100. transformers/models/arcee/configuration_arcee.py +20 -23
  101. transformers/models/arcee/modeling_arcee.py +32 -35
  102. transformers/models/arcee/modular_arcee.py +20 -23
  103. transformers/models/aria/configuration_aria.py +20 -23
  104. transformers/models/aria/image_processing_aria.py +25 -27
  105. transformers/models/aria/modeling_aria.py +71 -70
  106. transformers/models/aria/modular_aria.py +85 -88
  107. transformers/models/aria/processing_aria.py +28 -35
  108. transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +0 -1
  109. transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +3 -6
  110. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +6 -8
  111. transformers/models/audioflamingo3/__init__.py +0 -1
  112. transformers/models/audioflamingo3/configuration_audioflamingo3.py +0 -1
  113. transformers/models/audioflamingo3/modeling_audioflamingo3.py +22 -23
  114. transformers/models/audioflamingo3/modular_audioflamingo3.py +12 -17
  115. transformers/models/audioflamingo3/processing_audioflamingo3.py +33 -30
  116. transformers/models/auto/auto_factory.py +5 -6
  117. transformers/models/auto/configuration_auto.py +53 -5
  118. transformers/models/auto/feature_extraction_auto.py +12 -10
  119. transformers/models/auto/image_processing_auto.py +17 -28
  120. transformers/models/auto/modeling_auto.py +38 -188
  121. transformers/models/auto/processing_auto.py +6 -1
  122. transformers/models/auto/tokenization_auto.py +147 -169
  123. transformers/models/auto/video_processing_auto.py +12 -10
  124. transformers/models/autoformer/configuration_autoformer.py +4 -7
  125. transformers/models/autoformer/modeling_autoformer.py +98 -100
  126. transformers/models/aya_vision/configuration_aya_vision.py +0 -1
  127. transformers/models/aya_vision/modeling_aya_vision.py +42 -40
  128. transformers/models/aya_vision/modular_aya_vision.py +26 -29
  129. transformers/models/aya_vision/processing_aya_vision.py +25 -53
  130. transformers/models/bamba/configuration_bamba.py +29 -32
  131. transformers/models/bamba/modeling_bamba.py +78 -83
  132. transformers/models/bamba/modular_bamba.py +68 -71
  133. transformers/models/bark/configuration_bark.py +4 -7
  134. transformers/models/bark/generation_configuration_bark.py +3 -5
  135. transformers/models/bark/modeling_bark.py +49 -55
  136. transformers/models/bark/processing_bark.py +19 -41
  137. transformers/models/bart/configuration_bart.py +0 -2
  138. transformers/models/bart/modeling_bart.py +122 -117
  139. transformers/models/barthez/tokenization_barthez.py +1 -4
  140. transformers/models/bartpho/tokenization_bartpho.py +6 -7
  141. transformers/models/beit/configuration_beit.py +0 -11
  142. transformers/models/beit/image_processing_beit.py +53 -56
  143. transformers/models/beit/image_processing_beit_fast.py +8 -10
  144. transformers/models/beit/modeling_beit.py +51 -53
  145. transformers/models/bert/configuration_bert.py +0 -1
  146. transformers/models/bert/modeling_bert.py +114 -122
  147. transformers/models/bert/tokenization_bert.py +2 -4
  148. transformers/models/bert/tokenization_bert_legacy.py +3 -5
  149. transformers/models/bert_generation/configuration_bert_generation.py +0 -1
  150. transformers/models/bert_generation/modeling_bert_generation.py +49 -49
  151. transformers/models/bert_generation/tokenization_bert_generation.py +2 -3
  152. transformers/models/bert_japanese/tokenization_bert_japanese.py +5 -6
  153. transformers/models/bertweet/tokenization_bertweet.py +1 -3
  154. transformers/models/big_bird/configuration_big_bird.py +0 -1
  155. transformers/models/big_bird/modeling_big_bird.py +110 -109
  156. transformers/models/big_bird/tokenization_big_bird.py +1 -4
  157. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +0 -1
  158. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +116 -111
  159. transformers/models/biogpt/configuration_biogpt.py +0 -1
  160. transformers/models/biogpt/modeling_biogpt.py +69 -71
  161. transformers/models/biogpt/modular_biogpt.py +59 -61
  162. transformers/models/biogpt/tokenization_biogpt.py +3 -5
  163. transformers/models/bit/configuration_bit.py +0 -1
  164. transformers/models/bit/image_processing_bit.py +21 -24
  165. transformers/models/bit/image_processing_bit_fast.py +0 -1
  166. transformers/models/bit/modeling_bit.py +14 -12
  167. transformers/models/bitnet/configuration_bitnet.py +18 -21
  168. transformers/models/bitnet/modeling_bitnet.py +32 -35
  169. transformers/models/bitnet/modular_bitnet.py +4 -6
  170. transformers/models/blenderbot/configuration_blenderbot.py +0 -1
  171. transformers/models/blenderbot/modeling_blenderbot.py +71 -95
  172. transformers/models/blenderbot/tokenization_blenderbot.py +6 -8
  173. transformers/models/blenderbot_small/configuration_blenderbot_small.py +0 -1
  174. transformers/models/blenderbot_small/modeling_blenderbot_small.py +73 -68
  175. transformers/models/blenderbot_small/tokenization_blenderbot_small.py +1 -3
  176. transformers/models/blip/configuration_blip.py +0 -1
  177. transformers/models/blip/image_processing_blip.py +17 -20
  178. transformers/models/blip/image_processing_blip_fast.py +0 -1
  179. transformers/models/blip/modeling_blip.py +62 -71
  180. transformers/models/blip/modeling_blip_text.py +71 -65
  181. transformers/models/blip/processing_blip.py +5 -36
  182. transformers/models/blip_2/configuration_blip_2.py +0 -1
  183. transformers/models/blip_2/modeling_blip_2.py +72 -71
  184. transformers/models/blip_2/processing_blip_2.py +8 -38
  185. transformers/models/bloom/configuration_bloom.py +0 -1
  186. transformers/models/bloom/modeling_bloom.py +71 -103
  187. transformers/models/blt/configuration_blt.py +71 -74
  188. transformers/models/blt/modeling_blt.py +235 -78
  189. transformers/models/blt/modular_blt.py +225 -62
  190. transformers/models/bridgetower/configuration_bridgetower.py +0 -1
  191. transformers/models/bridgetower/image_processing_bridgetower.py +34 -35
  192. transformers/models/bridgetower/image_processing_bridgetower_fast.py +7 -10
  193. transformers/models/bridgetower/modeling_bridgetower.py +113 -109
  194. transformers/models/bridgetower/processing_bridgetower.py +2 -16
  195. transformers/models/bros/configuration_bros.py +0 -1
  196. transformers/models/bros/modeling_bros.py +86 -80
  197. transformers/models/bros/processing_bros.py +2 -12
  198. transformers/models/byt5/tokenization_byt5.py +4 -6
  199. transformers/models/camembert/configuration_camembert.py +0 -1
  200. transformers/models/camembert/modeling_camembert.py +196 -195
  201. transformers/models/camembert/modular_camembert.py +51 -54
  202. transformers/models/camembert/tokenization_camembert.py +1 -4
  203. transformers/models/canine/configuration_canine.py +0 -1
  204. transformers/models/canine/modeling_canine.py +79 -75
  205. transformers/models/canine/tokenization_canine.py +2 -1
  206. transformers/models/chameleon/configuration_chameleon.py +24 -27
  207. transformers/models/chameleon/image_processing_chameleon.py +21 -24
  208. transformers/models/chameleon/image_processing_chameleon_fast.py +0 -1
  209. transformers/models/chameleon/modeling_chameleon.py +62 -60
  210. transformers/models/chameleon/processing_chameleon.py +16 -41
  211. transformers/models/chinese_clip/configuration_chinese_clip.py +0 -1
  212. transformers/models/chinese_clip/image_processing_chinese_clip.py +21 -24
  213. transformers/models/chinese_clip/image_processing_chinese_clip_fast.py +0 -1
  214. transformers/models/chinese_clip/modeling_chinese_clip.py +71 -69
  215. transformers/models/chinese_clip/processing_chinese_clip.py +2 -15
  216. transformers/models/clap/configuration_clap.py +0 -1
  217. transformers/models/clap/feature_extraction_clap.py +11 -12
  218. transformers/models/clap/modeling_clap.py +113 -104
  219. transformers/models/clap/processing_clap.py +2 -15
  220. transformers/models/clip/configuration_clip.py +0 -1
  221. transformers/models/clip/image_processing_clip.py +21 -24
  222. transformers/models/clip/image_processing_clip_fast.py +0 -1
  223. transformers/models/clip/modeling_clip.py +47 -46
  224. transformers/models/clip/processing_clip.py +2 -14
  225. transformers/models/clip/tokenization_clip.py +2 -5
  226. transformers/models/clipseg/configuration_clipseg.py +0 -1
  227. transformers/models/clipseg/modeling_clipseg.py +90 -87
  228. transformers/models/clipseg/processing_clipseg.py +8 -39
  229. transformers/models/clvp/configuration_clvp.py +1 -3
  230. transformers/models/clvp/feature_extraction_clvp.py +7 -10
  231. transformers/models/clvp/modeling_clvp.py +133 -118
  232. transformers/models/clvp/number_normalizer.py +1 -2
  233. transformers/models/clvp/processing_clvp.py +3 -20
  234. transformers/models/clvp/tokenization_clvp.py +0 -1
  235. transformers/models/code_llama/tokenization_code_llama.py +4 -7
  236. transformers/models/codegen/configuration_codegen.py +0 -1
  237. transformers/models/codegen/modeling_codegen.py +61 -52
  238. transformers/models/codegen/tokenization_codegen.py +5 -6
  239. transformers/models/cohere/configuration_cohere.py +20 -23
  240. transformers/models/cohere/modeling_cohere.py +36 -39
  241. transformers/models/cohere/modular_cohere.py +24 -28
  242. transformers/models/cohere/tokenization_cohere.py +5 -6
  243. transformers/models/cohere2/configuration_cohere2.py +21 -24
  244. transformers/models/cohere2/modeling_cohere2.py +35 -38
  245. transformers/models/cohere2/modular_cohere2.py +39 -41
  246. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +6 -8
  247. transformers/models/cohere2_vision/modeling_cohere2_vision.py +35 -33
  248. transformers/models/cohere2_vision/modular_cohere2_vision.py +21 -23
  249. transformers/models/cohere2_vision/processing_cohere2_vision.py +6 -36
  250. transformers/models/colpali/configuration_colpali.py +0 -1
  251. transformers/models/colpali/modeling_colpali.py +14 -16
  252. transformers/models/colpali/modular_colpali.py +11 -51
  253. transformers/models/colpali/processing_colpali.py +14 -52
  254. transformers/models/colqwen2/modeling_colqwen2.py +20 -22
  255. transformers/models/colqwen2/modular_colqwen2.py +29 -68
  256. transformers/models/colqwen2/processing_colqwen2.py +16 -52
  257. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -2
  258. transformers/models/conditional_detr/image_processing_conditional_detr.py +64 -66
  259. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +22 -22
  260. transformers/models/conditional_detr/modeling_conditional_detr.py +82 -81
  261. transformers/models/conditional_detr/modular_conditional_detr.py +1 -3
  262. transformers/models/convbert/configuration_convbert.py +0 -1
  263. transformers/models/convbert/modeling_convbert.py +88 -87
  264. transformers/models/convbert/tokenization_convbert.py +0 -1
  265. transformers/models/convnext/configuration_convnext.py +0 -1
  266. transformers/models/convnext/image_processing_convnext.py +20 -23
  267. transformers/models/convnext/image_processing_convnext_fast.py +14 -19
  268. transformers/models/convnext/modeling_convnext.py +5 -8
  269. transformers/models/convnextv2/configuration_convnextv2.py +0 -1
  270. transformers/models/convnextv2/modeling_convnextv2.py +5 -8
  271. transformers/models/cpm/tokenization_cpm.py +6 -7
  272. transformers/models/cpm/tokenization_cpm_fast.py +3 -5
  273. transformers/models/cpmant/configuration_cpmant.py +0 -1
  274. transformers/models/cpmant/modeling_cpmant.py +38 -40
  275. transformers/models/cpmant/tokenization_cpmant.py +1 -3
  276. transformers/models/csm/configuration_csm.py +49 -51
  277. transformers/models/csm/generation_csm.py +31 -35
  278. transformers/models/csm/modeling_csm.py +81 -82
  279. transformers/models/csm/modular_csm.py +58 -58
  280. transformers/models/csm/processing_csm.py +25 -68
  281. transformers/models/ctrl/configuration_ctrl.py +0 -1
  282. transformers/models/ctrl/modeling_ctrl.py +52 -43
  283. transformers/models/ctrl/tokenization_ctrl.py +0 -1
  284. transformers/models/cvt/configuration_cvt.py +0 -1
  285. transformers/models/cvt/modeling_cvt.py +18 -16
  286. transformers/models/cwm/__init__.py +0 -1
  287. transformers/models/cwm/configuration_cwm.py +3 -5
  288. transformers/models/cwm/modeling_cwm.py +33 -35
  289. transformers/models/cwm/modular_cwm.py +10 -12
  290. transformers/models/d_fine/configuration_d_fine.py +3 -5
  291. transformers/models/d_fine/modeling_d_fine.py +127 -121
  292. transformers/models/d_fine/modular_d_fine.py +23 -13
  293. transformers/models/dab_detr/configuration_dab_detr.py +2 -3
  294. transformers/models/dab_detr/modeling_dab_detr.py +69 -71
  295. transformers/models/dac/configuration_dac.py +0 -1
  296. transformers/models/dac/feature_extraction_dac.py +6 -9
  297. transformers/models/dac/modeling_dac.py +21 -23
  298. transformers/models/data2vec/configuration_data2vec_audio.py +0 -1
  299. transformers/models/data2vec/configuration_data2vec_text.py +0 -1
  300. transformers/models/data2vec/configuration_data2vec_vision.py +0 -1
  301. transformers/models/data2vec/modeling_data2vec_audio.py +52 -56
  302. transformers/models/data2vec/modeling_data2vec_text.py +98 -93
  303. transformers/models/data2vec/modeling_data2vec_vision.py +41 -42
  304. transformers/models/data2vec/modular_data2vec_audio.py +6 -1
  305. transformers/models/data2vec/modular_data2vec_text.py +58 -54
  306. transformers/models/dbrx/configuration_dbrx.py +27 -20
  307. transformers/models/dbrx/modeling_dbrx.py +40 -43
  308. transformers/models/dbrx/modular_dbrx.py +31 -33
  309. transformers/models/deberta/configuration_deberta.py +0 -1
  310. transformers/models/deberta/modeling_deberta.py +59 -60
  311. transformers/models/deberta/tokenization_deberta.py +2 -5
  312. transformers/models/deberta_v2/configuration_deberta_v2.py +0 -1
  313. transformers/models/deberta_v2/modeling_deberta_v2.py +65 -65
  314. transformers/models/deberta_v2/tokenization_deberta_v2.py +1 -4
  315. transformers/models/decision_transformer/configuration_decision_transformer.py +0 -1
  316. transformers/models/decision_transformer/modeling_decision_transformer.py +56 -55
  317. transformers/models/deepseek_v2/configuration_deepseek_v2.py +34 -37
  318. transformers/models/deepseek_v2/modeling_deepseek_v2.py +39 -37
  319. transformers/models/deepseek_v2/modular_deepseek_v2.py +44 -44
  320. transformers/models/deepseek_v3/configuration_deepseek_v3.py +35 -38
  321. transformers/models/deepseek_v3/modeling_deepseek_v3.py +40 -38
  322. transformers/models/deepseek_v3/modular_deepseek_v3.py +10 -7
  323. transformers/models/deepseek_vl/configuration_deepseek_vl.py +2 -3
  324. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +25 -26
  325. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +7 -7
  326. transformers/models/deepseek_vl/modeling_deepseek_vl.py +40 -36
  327. transformers/models/deepseek_vl/modular_deepseek_vl.py +14 -43
  328. transformers/models/deepseek_vl/processing_deepseek_vl.py +10 -41
  329. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +3 -5
  330. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +35 -35
  331. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +16 -20
  332. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +42 -38
  333. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +80 -99
  334. transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +12 -44
  335. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -3
  336. transformers/models/deformable_detr/image_processing_deformable_detr.py +59 -61
  337. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +17 -17
  338. transformers/models/deformable_detr/modeling_deformable_detr.py +67 -68
  339. transformers/models/deformable_detr/modular_deformable_detr.py +1 -3
  340. transformers/models/deit/configuration_deit.py +0 -1
  341. transformers/models/deit/image_processing_deit.py +18 -21
  342. transformers/models/deit/image_processing_deit_fast.py +0 -1
  343. transformers/models/deit/modeling_deit.py +16 -18
  344. transformers/models/depth_anything/configuration_depth_anything.py +2 -4
  345. transformers/models/depth_anything/modeling_depth_anything.py +5 -8
  346. transformers/models/depth_pro/configuration_depth_pro.py +0 -1
  347. transformers/models/depth_pro/image_processing_depth_pro.py +22 -23
  348. transformers/models/depth_pro/image_processing_depth_pro_fast.py +6 -8
  349. transformers/models/depth_pro/modeling_depth_pro.py +21 -23
  350. transformers/models/detr/configuration_detr.py +1 -2
  351. transformers/models/detr/image_processing_detr.py +64 -66
  352. transformers/models/detr/image_processing_detr_fast.py +22 -23
  353. transformers/models/detr/modeling_detr.py +78 -73
  354. transformers/models/dia/configuration_dia.py +5 -8
  355. transformers/models/dia/feature_extraction_dia.py +6 -9
  356. transformers/models/dia/generation_dia.py +42 -45
  357. transformers/models/dia/modeling_dia.py +73 -65
  358. transformers/models/dia/modular_dia.py +63 -54
  359. transformers/models/dia/processing_dia.py +39 -29
  360. transformers/models/dia/tokenization_dia.py +3 -6
  361. transformers/models/diffllama/configuration_diffllama.py +20 -23
  362. transformers/models/diffllama/modeling_diffllama.py +44 -47
  363. transformers/models/diffllama/modular_diffllama.py +17 -19
  364. transformers/models/dinat/configuration_dinat.py +0 -1
  365. transformers/models/dinat/modeling_dinat.py +40 -42
  366. transformers/models/dinov2/configuration_dinov2.py +0 -1
  367. transformers/models/dinov2/modeling_dinov2.py +11 -13
  368. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +1 -1
  369. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +12 -13
  370. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +5 -7
  371. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +4 -7
  372. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +3 -6
  373. transformers/models/dinov3_vit/configuration_dinov3_vit.py +5 -8
  374. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +5 -7
  375. transformers/models/dinov3_vit/modeling_dinov3_vit.py +17 -16
  376. transformers/models/dinov3_vit/modular_dinov3_vit.py +14 -13
  377. transformers/models/distilbert/configuration_distilbert.py +0 -1
  378. transformers/models/distilbert/modeling_distilbert.py +55 -55
  379. transformers/models/distilbert/tokenization_distilbert.py +0 -1
  380. transformers/models/doge/__init__.py +0 -1
  381. transformers/models/doge/configuration_doge.py +25 -28
  382. transformers/models/doge/modeling_doge.py +43 -46
  383. transformers/models/doge/modular_doge.py +57 -58
  384. transformers/models/donut/configuration_donut_swin.py +0 -1
  385. transformers/models/donut/image_processing_donut.py +26 -29
  386. transformers/models/donut/image_processing_donut_fast.py +5 -11
  387. transformers/models/donut/modeling_donut_swin.py +60 -58
  388. transformers/models/donut/processing_donut.py +5 -26
  389. transformers/models/dots1/configuration_dots1.py +27 -29
  390. transformers/models/dots1/modeling_dots1.py +45 -39
  391. transformers/models/dots1/modular_dots1.py +0 -1
  392. transformers/models/dpr/configuration_dpr.py +0 -1
  393. transformers/models/dpr/modeling_dpr.py +37 -39
  394. transformers/models/dpr/tokenization_dpr.py +7 -9
  395. transformers/models/dpr/tokenization_dpr_fast.py +7 -9
  396. transformers/models/dpt/configuration_dpt.py +1 -2
  397. transformers/models/dpt/image_processing_dpt.py +65 -66
  398. transformers/models/dpt/image_processing_dpt_fast.py +14 -16
  399. transformers/models/dpt/modeling_dpt.py +19 -21
  400. transformers/models/dpt/modular_dpt.py +11 -13
  401. transformers/models/edgetam/configuration_edgetam.py +1 -2
  402. transformers/models/edgetam/modeling_edgetam.py +44 -43
  403. transformers/models/edgetam/modular_edgetam.py +17 -20
  404. transformers/models/edgetam_video/__init__.py +0 -1
  405. transformers/models/edgetam_video/configuration_edgetam_video.py +0 -1
  406. transformers/models/edgetam_video/modeling_edgetam_video.py +131 -120
  407. transformers/models/edgetam_video/modular_edgetam_video.py +29 -37
  408. transformers/models/efficientloftr/configuration_efficientloftr.py +4 -5
  409. transformers/models/efficientloftr/image_processing_efficientloftr.py +14 -16
  410. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +5 -6
  411. transformers/models/efficientloftr/modeling_efficientloftr.py +41 -30
  412. transformers/models/efficientloftr/modular_efficientloftr.py +1 -3
  413. transformers/models/efficientnet/configuration_efficientnet.py +0 -1
  414. transformers/models/efficientnet/image_processing_efficientnet.py +28 -32
  415. transformers/models/efficientnet/image_processing_efficientnet_fast.py +15 -17
  416. transformers/models/efficientnet/modeling_efficientnet.py +17 -15
  417. transformers/models/electra/configuration_electra.py +0 -1
  418. transformers/models/electra/modeling_electra.py +108 -103
  419. transformers/models/emu3/configuration_emu3.py +5 -7
  420. transformers/models/emu3/image_processing_emu3.py +44 -39
  421. transformers/models/emu3/modeling_emu3.py +67 -64
  422. transformers/models/emu3/modular_emu3.py +39 -35
  423. transformers/models/emu3/processing_emu3.py +18 -43
  424. transformers/models/encodec/configuration_encodec.py +2 -4
  425. transformers/models/encodec/feature_extraction_encodec.py +10 -13
  426. transformers/models/encodec/modeling_encodec.py +39 -29
  427. transformers/models/encoder_decoder/configuration_encoder_decoder.py +0 -1
  428. transformers/models/encoder_decoder/modeling_encoder_decoder.py +17 -19
  429. transformers/models/eomt/configuration_eomt.py +0 -1
  430. transformers/models/eomt/image_processing_eomt.py +53 -55
  431. transformers/models/eomt/image_processing_eomt_fast.py +59 -28
  432. transformers/models/eomt/modeling_eomt.py +23 -18
  433. transformers/models/eomt/modular_eomt.py +18 -13
  434. transformers/models/ernie/configuration_ernie.py +0 -1
  435. transformers/models/ernie/modeling_ernie.py +127 -132
  436. transformers/models/ernie/modular_ernie.py +97 -103
  437. transformers/models/ernie4_5/configuration_ernie4_5.py +18 -20
  438. transformers/models/ernie4_5/modeling_ernie4_5.py +32 -34
  439. transformers/models/ernie4_5/modular_ernie4_5.py +1 -3
  440. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +27 -29
  441. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +52 -51
  442. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +16 -44
  443. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  444. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +329 -0
  445. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +455 -0
  446. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +231 -0
  447. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1895 -0
  448. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1901 -0
  449. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +249 -0
  450. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +593 -0
  451. transformers/models/esm/configuration_esm.py +2 -4
  452. transformers/models/esm/modeling_esm.py +38 -34
  453. transformers/models/esm/modeling_esmfold.py +48 -45
  454. transformers/models/esm/openfold_utils/chunk_utils.py +6 -6
  455. transformers/models/esm/openfold_utils/loss.py +1 -2
  456. transformers/models/esm/openfold_utils/protein.py +13 -13
  457. transformers/models/esm/openfold_utils/tensor_utils.py +6 -6
  458. transformers/models/esm/tokenization_esm.py +2 -4
  459. transformers/models/evolla/configuration_evolla.py +29 -32
  460. transformers/models/evolla/modeling_evolla.py +67 -62
  461. transformers/models/evolla/modular_evolla.py +53 -47
  462. transformers/models/evolla/processing_evolla.py +23 -35
  463. transformers/models/exaone4/configuration_exaone4.py +19 -22
  464. transformers/models/exaone4/modeling_exaone4.py +33 -36
  465. transformers/models/exaone4/modular_exaone4.py +40 -42
  466. transformers/models/falcon/configuration_falcon.py +22 -25
  467. transformers/models/falcon/modeling_falcon.py +75 -78
  468. transformers/models/falcon_h1/configuration_falcon_h1.py +40 -43
  469. transformers/models/falcon_h1/modeling_falcon_h1.py +80 -78
  470. transformers/models/falcon_h1/modular_falcon_h1.py +54 -50
  471. transformers/models/falcon_mamba/configuration_falcon_mamba.py +0 -1
  472. transformers/models/falcon_mamba/modeling_falcon_mamba.py +50 -47
  473. transformers/models/falcon_mamba/modular_falcon_mamba.py +16 -14
  474. transformers/models/fast_vlm/configuration_fast_vlm.py +1 -0
  475. transformers/models/fast_vlm/modeling_fast_vlm.py +43 -39
  476. transformers/models/fast_vlm/modular_fast_vlm.py +2 -3
  477. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +2 -5
  478. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +68 -57
  479. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +2 -3
  480. transformers/models/flaubert/configuration_flaubert.py +0 -1
  481. transformers/models/flaubert/modeling_flaubert.py +138 -143
  482. transformers/models/flaubert/tokenization_flaubert.py +3 -5
  483. transformers/models/flava/configuration_flava.py +5 -6
  484. transformers/models/flava/image_processing_flava.py +66 -67
  485. transformers/models/flava/image_processing_flava_fast.py +42 -45
  486. transformers/models/flava/modeling_flava.py +111 -107
  487. transformers/models/flava/processing_flava.py +2 -12
  488. transformers/models/flex_olmo/__init__.py +0 -1
  489. transformers/models/flex_olmo/configuration_flex_olmo.py +23 -25
  490. transformers/models/flex_olmo/modeling_flex_olmo.py +44 -43
  491. transformers/models/flex_olmo/modular_flex_olmo.py +35 -37
  492. transformers/models/florence2/configuration_florence2.py +0 -1
  493. transformers/models/florence2/modeling_florence2.py +59 -43
  494. transformers/models/florence2/modular_florence2.py +65 -81
  495. transformers/models/florence2/processing_florence2.py +18 -47
  496. transformers/models/fnet/configuration_fnet.py +0 -1
  497. transformers/models/fnet/modeling_fnet.py +76 -80
  498. transformers/models/fnet/tokenization_fnet.py +0 -1
  499. transformers/models/focalnet/configuration_focalnet.py +0 -1
  500. transformers/models/focalnet/modeling_focalnet.py +39 -41
  501. transformers/models/fsmt/configuration_fsmt.py +0 -1
  502. transformers/models/fsmt/modeling_fsmt.py +47 -48
  503. transformers/models/fsmt/tokenization_fsmt.py +3 -5
  504. transformers/models/funnel/configuration_funnel.py +0 -1
  505. transformers/models/funnel/modeling_funnel.py +91 -93
  506. transformers/models/funnel/tokenization_funnel.py +2 -5
  507. transformers/models/fuyu/configuration_fuyu.py +23 -26
  508. transformers/models/fuyu/image_processing_fuyu.py +29 -31
  509. transformers/models/fuyu/image_processing_fuyu_fast.py +12 -13
  510. transformers/models/fuyu/modeling_fuyu.py +29 -30
  511. transformers/models/fuyu/processing_fuyu.py +23 -34
  512. transformers/models/gemma/configuration_gemma.py +20 -23
  513. transformers/models/gemma/modeling_gemma.py +42 -46
  514. transformers/models/gemma/modular_gemma.py +37 -40
  515. transformers/models/gemma/tokenization_gemma.py +3 -6
  516. transformers/models/gemma2/configuration_gemma2.py +25 -28
  517. transformers/models/gemma2/modeling_gemma2.py +35 -38
  518. transformers/models/gemma2/modular_gemma2.py +56 -58
  519. transformers/models/gemma3/configuration_gemma3.py +28 -29
  520. transformers/models/gemma3/image_processing_gemma3.py +29 -31
  521. transformers/models/gemma3/image_processing_gemma3_fast.py +9 -11
  522. transformers/models/gemma3/modeling_gemma3.py +112 -94
  523. transformers/models/gemma3/modular_gemma3.py +110 -91
  524. transformers/models/gemma3/processing_gemma3.py +5 -5
  525. transformers/models/gemma3n/configuration_gemma3n.py +12 -10
  526. transformers/models/gemma3n/feature_extraction_gemma3n.py +9 -11
  527. transformers/models/gemma3n/modeling_gemma3n.py +127 -98
  528. transformers/models/gemma3n/modular_gemma3n.py +117 -84
  529. transformers/models/gemma3n/processing_gemma3n.py +12 -26
  530. transformers/models/git/configuration_git.py +0 -1
  531. transformers/models/git/modeling_git.py +250 -197
  532. transformers/models/git/processing_git.py +2 -14
  533. transformers/models/glm/configuration_glm.py +19 -21
  534. transformers/models/glm/modeling_glm.py +33 -36
  535. transformers/models/glm/modular_glm.py +4 -7
  536. transformers/models/glm4/configuration_glm4.py +19 -21
  537. transformers/models/glm4/modeling_glm4.py +36 -38
  538. transformers/models/glm4/modular_glm4.py +8 -10
  539. transformers/models/glm46v/configuration_glm46v.py +0 -1
  540. transformers/models/glm46v/image_processing_glm46v.py +35 -40
  541. transformers/models/glm46v/image_processing_glm46v_fast.py +7 -7
  542. transformers/models/glm46v/modeling_glm46v.py +54 -52
  543. transformers/models/glm46v/modular_glm46v.py +4 -3
  544. transformers/models/glm46v/processing_glm46v.py +7 -41
  545. transformers/models/glm46v/video_processing_glm46v.py +9 -11
  546. transformers/models/glm4_moe/configuration_glm4_moe.py +25 -28
  547. transformers/models/glm4_moe/modeling_glm4_moe.py +41 -40
  548. transformers/models/glm4_moe/modular_glm4_moe.py +27 -30
  549. transformers/models/glm4_moe_lite/__init__.py +28 -0
  550. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +235 -0
  551. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +740 -0
  552. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +304 -0
  553. transformers/models/glm4v/configuration_glm4v.py +14 -17
  554. transformers/models/glm4v/image_processing_glm4v.py +34 -40
  555. transformers/models/glm4v/image_processing_glm4v_fast.py +6 -7
  556. transformers/models/glm4v/modeling_glm4v.py +148 -156
  557. transformers/models/glm4v/modular_glm4v.py +142 -185
  558. transformers/models/glm4v/processing_glm4v.py +7 -41
  559. transformers/models/glm4v/video_processing_glm4v.py +9 -11
  560. transformers/models/glm4v_moe/configuration_glm4v_moe.py +119 -122
  561. transformers/models/glm4v_moe/modeling_glm4v_moe.py +275 -319
  562. transformers/models/glm4v_moe/modular_glm4v_moe.py +66 -163
  563. transformers/models/glm_image/__init__.py +31 -0
  564. transformers/models/glm_image/configuration_glm_image.py +352 -0
  565. transformers/models/glm_image/image_processing_glm_image.py +503 -0
  566. transformers/models/glm_image/image_processing_glm_image_fast.py +296 -0
  567. transformers/models/glm_image/modeling_glm_image.py +1590 -0
  568. transformers/models/glm_image/modular_glm_image.py +1480 -0
  569. transformers/models/glm_image/processing_glm_image.py +217 -0
  570. transformers/models/glmasr/__init__.py +29 -0
  571. transformers/models/glmasr/configuration_glmasr.py +196 -0
  572. transformers/models/glmasr/modeling_glmasr.py +511 -0
  573. transformers/models/glmasr/modular_glmasr.py +431 -0
  574. transformers/models/glmasr/processing_glmasr.py +331 -0
  575. transformers/models/glpn/configuration_glpn.py +0 -1
  576. transformers/models/glpn/image_processing_glpn.py +11 -12
  577. transformers/models/glpn/image_processing_glpn_fast.py +8 -10
  578. transformers/models/glpn/modeling_glpn.py +10 -12
  579. transformers/models/got_ocr2/configuration_got_ocr2.py +5 -8
  580. transformers/models/got_ocr2/image_processing_got_ocr2.py +22 -24
  581. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +6 -8
  582. transformers/models/got_ocr2/modeling_got_ocr2.py +48 -45
  583. transformers/models/got_ocr2/modular_got_ocr2.py +31 -34
  584. transformers/models/got_ocr2/processing_got_ocr2.py +42 -63
  585. transformers/models/gpt2/configuration_gpt2.py +0 -1
  586. transformers/models/gpt2/modeling_gpt2.py +114 -113
  587. transformers/models/gpt2/tokenization_gpt2.py +6 -9
  588. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +0 -1
  589. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +76 -88
  590. transformers/models/gpt_neo/configuration_gpt_neo.py +0 -1
  591. transformers/models/gpt_neo/modeling_gpt_neo.py +77 -66
  592. transformers/models/gpt_neox/configuration_gpt_neox.py +19 -22
  593. transformers/models/gpt_neox/modeling_gpt_neox.py +71 -73
  594. transformers/models/gpt_neox/modular_gpt_neox.py +64 -66
  595. transformers/models/gpt_neox/tokenization_gpt_neox.py +2 -5
  596. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +15 -18
  597. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +42 -45
  598. transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +1 -3
  599. transformers/models/gpt_oss/configuration_gpt_oss.py +38 -24
  600. transformers/models/gpt_oss/modeling_gpt_oss.py +40 -44
  601. transformers/models/gpt_oss/modular_gpt_oss.py +22 -26
  602. transformers/models/gpt_sw3/tokenization_gpt_sw3.py +4 -4
  603. transformers/models/gptj/configuration_gptj.py +0 -1
  604. transformers/models/gptj/modeling_gptj.py +96 -86
  605. transformers/models/granite/configuration_granite.py +23 -26
  606. transformers/models/granite/modeling_granite.py +40 -42
  607. transformers/models/granite/modular_granite.py +29 -31
  608. transformers/models/granite_speech/configuration_granite_speech.py +0 -1
  609. transformers/models/granite_speech/feature_extraction_granite_speech.py +1 -3
  610. transformers/models/granite_speech/modeling_granite_speech.py +36 -24
  611. transformers/models/granite_speech/processing_granite_speech.py +11 -4
  612. transformers/models/granitemoe/configuration_granitemoe.py +26 -29
  613. transformers/models/granitemoe/modeling_granitemoe.py +37 -40
  614. transformers/models/granitemoe/modular_granitemoe.py +22 -25
  615. transformers/models/granitemoehybrid/__init__.py +0 -1
  616. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +41 -40
  617. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +92 -86
  618. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +29 -21
  619. transformers/models/granitemoeshared/configuration_granitemoeshared.py +27 -30
  620. transformers/models/granitemoeshared/modeling_granitemoeshared.py +50 -55
  621. transformers/models/granitemoeshared/modular_granitemoeshared.py +19 -21
  622. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -4
  623. transformers/models/grounding_dino/image_processing_grounding_dino.py +60 -62
  624. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +17 -18
  625. transformers/models/grounding_dino/modeling_grounding_dino.py +95 -97
  626. transformers/models/grounding_dino/modular_grounding_dino.py +2 -3
  627. transformers/models/grounding_dino/processing_grounding_dino.py +10 -38
  628. transformers/models/groupvit/configuration_groupvit.py +0 -1
  629. transformers/models/groupvit/modeling_groupvit.py +75 -71
  630. transformers/models/helium/configuration_helium.py +20 -22
  631. transformers/models/helium/modeling_helium.py +34 -37
  632. transformers/models/helium/modular_helium.py +3 -7
  633. transformers/models/herbert/tokenization_herbert.py +4 -6
  634. transformers/models/hgnet_v2/configuration_hgnet_v2.py +0 -1
  635. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -9
  636. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -9
  637. transformers/models/hiera/configuration_hiera.py +0 -1
  638. transformers/models/hiera/modeling_hiera.py +60 -62
  639. transformers/models/hubert/configuration_hubert.py +0 -1
  640. transformers/models/hubert/modeling_hubert.py +39 -37
  641. transformers/models/hubert/modular_hubert.py +12 -11
  642. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +21 -24
  643. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +31 -34
  644. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +4 -6
  645. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  646. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +25 -28
  647. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +44 -39
  648. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +9 -9
  649. transformers/models/ibert/configuration_ibert.py +0 -1
  650. transformers/models/ibert/modeling_ibert.py +76 -62
  651. transformers/models/ibert/quant_modules.py +0 -1
  652. transformers/models/idefics/configuration_idefics.py +0 -1
  653. transformers/models/idefics/image_processing_idefics.py +13 -15
  654. transformers/models/idefics/modeling_idefics.py +70 -61
  655. transformers/models/idefics/perceiver.py +1 -3
  656. transformers/models/idefics/processing_idefics.py +32 -48
  657. transformers/models/idefics/vision.py +22 -24
  658. transformers/models/idefics2/configuration_idefics2.py +0 -1
  659. transformers/models/idefics2/image_processing_idefics2.py +31 -32
  660. transformers/models/idefics2/image_processing_idefics2_fast.py +7 -8
  661. transformers/models/idefics2/modeling_idefics2.py +63 -59
  662. transformers/models/idefics2/processing_idefics2.py +10 -68
  663. transformers/models/idefics3/configuration_idefics3.py +0 -1
  664. transformers/models/idefics3/image_processing_idefics3.py +42 -43
  665. transformers/models/idefics3/image_processing_idefics3_fast.py +11 -12
  666. transformers/models/idefics3/modeling_idefics3.py +57 -55
  667. transformers/models/idefics3/processing_idefics3.py +15 -69
  668. transformers/models/ijepa/configuration_ijepa.py +0 -1
  669. transformers/models/ijepa/modeling_ijepa.py +10 -11
  670. transformers/models/ijepa/modular_ijepa.py +5 -7
  671. transformers/models/imagegpt/configuration_imagegpt.py +0 -1
  672. transformers/models/imagegpt/image_processing_imagegpt.py +17 -18
  673. transformers/models/imagegpt/image_processing_imagegpt_fast.py +9 -14
  674. transformers/models/imagegpt/modeling_imagegpt.py +66 -60
  675. transformers/models/informer/configuration_informer.py +6 -9
  676. transformers/models/informer/modeling_informer.py +84 -86
  677. transformers/models/informer/modular_informer.py +13 -16
  678. transformers/models/instructblip/configuration_instructblip.py +0 -1
  679. transformers/models/instructblip/modeling_instructblip.py +45 -44
  680. transformers/models/instructblip/processing_instructblip.py +10 -36
  681. transformers/models/instructblipvideo/configuration_instructblipvideo.py +0 -1
  682. transformers/models/instructblipvideo/modeling_instructblipvideo.py +107 -105
  683. transformers/models/instructblipvideo/modular_instructblipvideo.py +34 -36
  684. transformers/models/instructblipvideo/processing_instructblipvideo.py +14 -33
  685. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +4 -6
  686. transformers/models/internvl/configuration_internvl.py +0 -1
  687. transformers/models/internvl/modeling_internvl.py +52 -51
  688. transformers/models/internvl/modular_internvl.py +24 -30
  689. transformers/models/internvl/processing_internvl.py +12 -45
  690. transformers/models/internvl/video_processing_internvl.py +8 -10
  691. transformers/models/jais2/__init__.py +27 -0
  692. transformers/models/jais2/configuration_jais2.py +150 -0
  693. transformers/models/jais2/modeling_jais2.py +484 -0
  694. transformers/models/jais2/modular_jais2.py +194 -0
  695. transformers/models/jamba/configuration_jamba.py +0 -1
  696. transformers/models/jamba/modeling_jamba.py +67 -65
  697. transformers/models/jamba/modular_jamba.py +54 -55
  698. transformers/models/janus/configuration_janus.py +0 -1
  699. transformers/models/janus/image_processing_janus.py +35 -37
  700. transformers/models/janus/image_processing_janus_fast.py +12 -14
  701. transformers/models/janus/modeling_janus.py +56 -50
  702. transformers/models/janus/modular_janus.py +76 -70
  703. transformers/models/janus/processing_janus.py +17 -43
  704. transformers/models/jetmoe/configuration_jetmoe.py +20 -23
  705. transformers/models/jetmoe/modeling_jetmoe.py +41 -44
  706. transformers/models/jetmoe/modular_jetmoe.py +31 -33
  707. transformers/models/kosmos2/configuration_kosmos2.py +0 -1
  708. transformers/models/kosmos2/modeling_kosmos2.py +159 -148
  709. transformers/models/kosmos2/processing_kosmos2.py +40 -55
  710. transformers/models/kosmos2_5/__init__.py +0 -1
  711. transformers/models/kosmos2_5/configuration_kosmos2_5.py +0 -1
  712. transformers/models/kosmos2_5/image_processing_kosmos2_5.py +10 -12
  713. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +4 -13
  714. transformers/models/kosmos2_5/modeling_kosmos2_5.py +118 -110
  715. transformers/models/kosmos2_5/processing_kosmos2_5.py +8 -29
  716. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +23 -25
  717. transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +12 -14
  718. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +67 -68
  719. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +28 -22
  720. transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py +2 -8
  721. transformers/models/lasr/configuration_lasr.py +5 -3
  722. transformers/models/lasr/feature_extraction_lasr.py +10 -12
  723. transformers/models/lasr/modeling_lasr.py +21 -23
  724. transformers/models/lasr/modular_lasr.py +16 -11
  725. transformers/models/lasr/processing_lasr.py +12 -8
  726. transformers/models/lasr/tokenization_lasr.py +2 -4
  727. transformers/models/layoutlm/configuration_layoutlm.py +0 -1
  728. transformers/models/layoutlm/modeling_layoutlm.py +72 -72
  729. transformers/models/layoutlmv2/configuration_layoutlmv2.py +0 -1
  730. transformers/models/layoutlmv2/image_processing_layoutlmv2.py +18 -21
  731. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +5 -7
  732. transformers/models/layoutlmv2/modeling_layoutlmv2.py +60 -50
  733. transformers/models/layoutlmv2/processing_layoutlmv2.py +14 -44
  734. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +64 -74
  735. transformers/models/layoutlmv3/configuration_layoutlmv3.py +0 -1
  736. transformers/models/layoutlmv3/image_processing_layoutlmv3.py +24 -26
  737. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +7 -9
  738. transformers/models/layoutlmv3/modeling_layoutlmv3.py +78 -56
  739. transformers/models/layoutlmv3/processing_layoutlmv3.py +14 -46
  740. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +64 -75
  741. transformers/models/layoutxlm/configuration_layoutxlm.py +0 -1
  742. transformers/models/layoutxlm/modular_layoutxlm.py +0 -1
  743. transformers/models/layoutxlm/processing_layoutxlm.py +14 -44
  744. transformers/models/layoutxlm/tokenization_layoutxlm.py +65 -76
  745. transformers/models/led/configuration_led.py +1 -4
  746. transformers/models/led/modeling_led.py +119 -267
  747. transformers/models/levit/configuration_levit.py +0 -1
  748. transformers/models/levit/image_processing_levit.py +19 -21
  749. transformers/models/levit/image_processing_levit_fast.py +0 -1
  750. transformers/models/levit/modeling_levit.py +35 -19
  751. transformers/models/lfm2/configuration_lfm2.py +22 -23
  752. transformers/models/lfm2/modeling_lfm2.py +43 -45
  753. transformers/models/lfm2/modular_lfm2.py +29 -29
  754. transformers/models/lfm2_moe/__init__.py +0 -1
  755. transformers/models/lfm2_moe/configuration_lfm2_moe.py +1 -2
  756. transformers/models/lfm2_moe/modeling_lfm2_moe.py +58 -49
  757. transformers/models/lfm2_moe/modular_lfm2_moe.py +13 -37
  758. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -1
  759. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +34 -5
  760. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -38
  761. transformers/models/lfm2_vl/modular_lfm2_vl.py +28 -29
  762. transformers/models/lfm2_vl/processing_lfm2_vl.py +96 -76
  763. transformers/models/lightglue/image_processing_lightglue.py +16 -15
  764. transformers/models/lightglue/image_processing_lightglue_fast.py +5 -6
  765. transformers/models/lightglue/modeling_lightglue.py +28 -30
  766. transformers/models/lightglue/modular_lightglue.py +28 -28
  767. transformers/models/lighton_ocr/__init__.py +28 -0
  768. transformers/models/lighton_ocr/configuration_lighton_ocr.py +128 -0
  769. transformers/models/lighton_ocr/modeling_lighton_ocr.py +460 -0
  770. transformers/models/lighton_ocr/modular_lighton_ocr.py +403 -0
  771. transformers/models/lighton_ocr/processing_lighton_ocr.py +229 -0
  772. transformers/models/lilt/configuration_lilt.py +0 -1
  773. transformers/models/lilt/modeling_lilt.py +72 -70
  774. transformers/models/llama/configuration_llama.py +21 -24
  775. transformers/models/llama/modeling_llama.py +32 -35
  776. transformers/models/llama/tokenization_llama.py +2 -4
  777. transformers/models/llama4/configuration_llama4.py +20 -22
  778. transformers/models/llama4/image_processing_llama4_fast.py +9 -11
  779. transformers/models/llama4/modeling_llama4.py +78 -75
  780. transformers/models/llama4/processing_llama4.py +33 -57
  781. transformers/models/llava/configuration_llava.py +0 -1
  782. transformers/models/llava/image_processing_llava.py +25 -28
  783. transformers/models/llava/image_processing_llava_fast.py +6 -8
  784. transformers/models/llava/modeling_llava.py +47 -44
  785. transformers/models/llava/processing_llava.py +18 -51
  786. transformers/models/llava_next/configuration_llava_next.py +0 -1
  787. transformers/models/llava_next/image_processing_llava_next.py +43 -45
  788. transformers/models/llava_next/image_processing_llava_next_fast.py +5 -7
  789. transformers/models/llava_next/modeling_llava_next.py +49 -47
  790. transformers/models/llava_next/processing_llava_next.py +18 -47
  791. transformers/models/llava_next_video/configuration_llava_next_video.py +0 -1
  792. transformers/models/llava_next_video/modeling_llava_next_video.py +60 -58
  793. transformers/models/llava_next_video/modular_llava_next_video.py +51 -49
  794. transformers/models/llava_next_video/processing_llava_next_video.py +21 -63
  795. transformers/models/llava_next_video/video_processing_llava_next_video.py +0 -1
  796. transformers/models/llava_onevision/configuration_llava_onevision.py +0 -1
  797. transformers/models/llava_onevision/image_processing_llava_onevision.py +40 -42
  798. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +6 -8
  799. transformers/models/llava_onevision/modeling_llava_onevision.py +67 -65
  800. transformers/models/llava_onevision/modular_llava_onevision.py +58 -56
  801. transformers/models/llava_onevision/processing_llava_onevision.py +21 -53
  802. transformers/models/llava_onevision/video_processing_llava_onevision.py +0 -1
  803. transformers/models/longcat_flash/__init__.py +0 -1
  804. transformers/models/longcat_flash/configuration_longcat_flash.py +32 -35
  805. transformers/models/longcat_flash/modeling_longcat_flash.py +32 -32
  806. transformers/models/longcat_flash/modular_longcat_flash.py +18 -19
  807. transformers/models/longformer/configuration_longformer.py +1 -4
  808. transformers/models/longformer/modeling_longformer.py +99 -101
  809. transformers/models/longt5/configuration_longt5.py +0 -1
  810. transformers/models/longt5/modeling_longt5.py +43 -48
  811. transformers/models/luke/configuration_luke.py +0 -1
  812. transformers/models/luke/modeling_luke.py +179 -181
  813. transformers/models/luke/tokenization_luke.py +99 -105
  814. transformers/models/lw_detr/__init__.py +27 -0
  815. transformers/models/lw_detr/configuration_lw_detr.py +374 -0
  816. transformers/models/lw_detr/modeling_lw_detr.py +1698 -0
  817. transformers/models/lw_detr/modular_lw_detr.py +1611 -0
  818. transformers/models/lxmert/configuration_lxmert.py +0 -1
  819. transformers/models/lxmert/modeling_lxmert.py +63 -74
  820. transformers/models/m2m_100/configuration_m2m_100.py +0 -1
  821. transformers/models/m2m_100/modeling_m2m_100.py +79 -71
  822. transformers/models/m2m_100/tokenization_m2m_100.py +8 -8
  823. transformers/models/mamba/configuration_mamba.py +0 -1
  824. transformers/models/mamba/modeling_mamba.py +44 -44
  825. transformers/models/mamba2/configuration_mamba2.py +0 -1
  826. transformers/models/mamba2/modeling_mamba2.py +67 -68
  827. transformers/models/marian/configuration_marian.py +1 -2
  828. transformers/models/marian/modeling_marian.py +87 -86
  829. transformers/models/marian/tokenization_marian.py +6 -6
  830. transformers/models/markuplm/configuration_markuplm.py +0 -1
  831. transformers/models/markuplm/feature_extraction_markuplm.py +1 -2
  832. transformers/models/markuplm/modeling_markuplm.py +65 -70
  833. transformers/models/markuplm/processing_markuplm.py +31 -38
  834. transformers/models/markuplm/tokenization_markuplm.py +67 -77
  835. transformers/models/mask2former/configuration_mask2former.py +5 -8
  836. transformers/models/mask2former/image_processing_mask2former.py +84 -85
  837. transformers/models/mask2former/image_processing_mask2former_fast.py +30 -33
  838. transformers/models/mask2former/modeling_mask2former.py +99 -92
  839. transformers/models/mask2former/modular_mask2former.py +6 -8
  840. transformers/models/maskformer/configuration_maskformer.py +6 -9
  841. transformers/models/maskformer/configuration_maskformer_swin.py +0 -1
  842. transformers/models/maskformer/image_processing_maskformer.py +84 -85
  843. transformers/models/maskformer/image_processing_maskformer_fast.py +29 -33
  844. transformers/models/maskformer/modeling_maskformer.py +65 -59
  845. transformers/models/maskformer/modeling_maskformer_swin.py +34 -32
  846. transformers/models/mbart/configuration_mbart.py +1 -1
  847. transformers/models/mbart/modeling_mbart.py +118 -113
  848. transformers/models/mbart/tokenization_mbart.py +2 -4
  849. transformers/models/mbart50/tokenization_mbart50.py +3 -5
  850. transformers/models/megatron_bert/configuration_megatron_bert.py +0 -1
  851. transformers/models/megatron_bert/modeling_megatron_bert.py +141 -150
  852. transformers/models/metaclip_2/modeling_metaclip_2.py +48 -46
  853. transformers/models/metaclip_2/modular_metaclip_2.py +21 -21
  854. transformers/models/mgp_str/configuration_mgp_str.py +0 -1
  855. transformers/models/mgp_str/modeling_mgp_str.py +14 -16
  856. transformers/models/mgp_str/processing_mgp_str.py +3 -20
  857. transformers/models/mgp_str/tokenization_mgp_str.py +1 -3
  858. transformers/models/mimi/configuration_mimi.py +38 -40
  859. transformers/models/mimi/modeling_mimi.py +100 -82
  860. transformers/models/minimax/__init__.py +0 -1
  861. transformers/models/minimax/configuration_minimax.py +32 -36
  862. transformers/models/minimax/modeling_minimax.py +57 -47
  863. transformers/models/minimax/modular_minimax.py +62 -54
  864. transformers/models/minimax_m2/__init__.py +28 -0
  865. transformers/models/minimax_m2/configuration_minimax_m2.py +211 -0
  866. transformers/models/minimax_m2/modeling_minimax_m2.py +704 -0
  867. transformers/models/minimax_m2/modular_minimax_m2.py +369 -0
  868. transformers/models/ministral/configuration_ministral.py +20 -22
  869. transformers/models/ministral/modeling_ministral.py +32 -34
  870. transformers/models/ministral/modular_ministral.py +27 -29
  871. transformers/models/ministral3/configuration_ministral3.py +19 -22
  872. transformers/models/ministral3/modeling_ministral3.py +32 -34
  873. transformers/models/ministral3/modular_ministral3.py +4 -5
  874. transformers/models/mistral/configuration_mistral.py +19 -22
  875. transformers/models/mistral/modeling_mistral.py +32 -34
  876. transformers/models/mistral/modular_mistral.py +11 -12
  877. transformers/models/mistral3/configuration_mistral3.py +0 -1
  878. transformers/models/mistral3/modeling_mistral3.py +53 -46
  879. transformers/models/mistral3/modular_mistral3.py +38 -36
  880. transformers/models/mixtral/configuration_mixtral.py +24 -27
  881. transformers/models/mixtral/modeling_mixtral.py +47 -42
  882. transformers/models/mixtral/modular_mixtral.py +32 -31
  883. transformers/models/mlcd/configuration_mlcd.py +0 -1
  884. transformers/models/mlcd/modeling_mlcd.py +16 -12
  885. transformers/models/mlcd/modular_mlcd.py +13 -11
  886. transformers/models/mllama/configuration_mllama.py +5 -8
  887. transformers/models/mllama/image_processing_mllama.py +23 -25
  888. transformers/models/mllama/image_processing_mllama_fast.py +5 -6
  889. transformers/models/mllama/modeling_mllama.py +94 -86
  890. transformers/models/mllama/processing_mllama.py +6 -55
  891. transformers/models/mluke/tokenization_mluke.py +97 -103
  892. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -3
  893. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +95 -97
  894. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -3
  895. transformers/models/mobilebert/configuration_mobilebert.py +0 -1
  896. transformers/models/mobilebert/modeling_mobilebert.py +77 -85
  897. transformers/models/mobilebert/tokenization_mobilebert.py +0 -1
  898. transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +0 -1
  899. transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +20 -23
  900. transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +0 -1
  901. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +13 -16
  902. transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +0 -1
  903. transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +48 -51
  904. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +10 -12
  905. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +17 -20
  906. transformers/models/mobilevit/configuration_mobilevit.py +0 -1
  907. transformers/models/mobilevit/image_processing_mobilevit.py +46 -49
  908. transformers/models/mobilevit/image_processing_mobilevit_fast.py +9 -11
  909. transformers/models/mobilevit/modeling_mobilevit.py +21 -19
  910. transformers/models/mobilevitv2/configuration_mobilevitv2.py +0 -1
  911. transformers/models/mobilevitv2/modeling_mobilevitv2.py +21 -20
  912. transformers/models/modernbert/configuration_modernbert.py +34 -34
  913. transformers/models/modernbert/modeling_modernbert.py +135 -126
  914. transformers/models/modernbert/modular_modernbert.py +167 -156
  915. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +30 -32
  916. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +54 -48
  917. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +78 -71
  918. transformers/models/moonshine/configuration_moonshine.py +22 -24
  919. transformers/models/moonshine/modeling_moonshine.py +64 -66
  920. transformers/models/moonshine/modular_moonshine.py +72 -73
  921. transformers/models/moshi/configuration_moshi.py +18 -21
  922. transformers/models/moshi/modeling_moshi.py +150 -183
  923. transformers/models/mpnet/configuration_mpnet.py +0 -1
  924. transformers/models/mpnet/modeling_mpnet.py +57 -57
  925. transformers/models/mpnet/tokenization_mpnet.py +1 -4
  926. transformers/models/mpt/configuration_mpt.py +1 -9
  927. transformers/models/mpt/modeling_mpt.py +58 -60
  928. transformers/models/mra/configuration_mra.py +0 -1
  929. transformers/models/mra/modeling_mra.py +58 -57
  930. transformers/models/mt5/configuration_mt5.py +2 -4
  931. transformers/models/mt5/modeling_mt5.py +75 -87
  932. transformers/models/musicgen/configuration_musicgen.py +0 -1
  933. transformers/models/musicgen/modeling_musicgen.py +113 -120
  934. transformers/models/musicgen/processing_musicgen.py +3 -21
  935. transformers/models/musicgen_melody/configuration_musicgen_melody.py +0 -1
  936. transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +8 -9
  937. transformers/models/musicgen_melody/modeling_musicgen_melody.py +110 -109
  938. transformers/models/musicgen_melody/processing_musicgen_melody.py +3 -22
  939. transformers/models/mvp/configuration_mvp.py +0 -1
  940. transformers/models/mvp/modeling_mvp.py +122 -119
  941. transformers/models/myt5/tokenization_myt5.py +8 -10
  942. transformers/models/nanochat/configuration_nanochat.py +0 -1
  943. transformers/models/nanochat/modeling_nanochat.py +33 -36
  944. transformers/models/nanochat/modular_nanochat.py +12 -14
  945. transformers/models/nemotron/configuration_nemotron.py +20 -23
  946. transformers/models/nemotron/modeling_nemotron.py +51 -54
  947. transformers/models/nllb/tokenization_nllb.py +7 -9
  948. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -1
  949. transformers/models/nllb_moe/modeling_nllb_moe.py +77 -69
  950. transformers/models/nougat/image_processing_nougat.py +29 -32
  951. transformers/models/nougat/image_processing_nougat_fast.py +4 -6
  952. transformers/models/nougat/processing_nougat.py +37 -39
  953. transformers/models/nougat/tokenization_nougat.py +16 -23
  954. transformers/models/nystromformer/configuration_nystromformer.py +0 -1
  955. transformers/models/nystromformer/modeling_nystromformer.py +68 -63
  956. transformers/models/olmo/configuration_olmo.py +18 -21
  957. transformers/models/olmo/modeling_olmo.py +32 -35
  958. transformers/models/olmo/modular_olmo.py +5 -9
  959. transformers/models/olmo2/configuration_olmo2.py +18 -21
  960. transformers/models/olmo2/modeling_olmo2.py +33 -36
  961. transformers/models/olmo2/modular_olmo2.py +29 -31
  962. transformers/models/olmo3/__init__.py +0 -1
  963. transformers/models/olmo3/configuration_olmo3.py +20 -23
  964. transformers/models/olmo3/modeling_olmo3.py +32 -35
  965. transformers/models/olmo3/modular_olmo3.py +31 -33
  966. transformers/models/olmoe/configuration_olmoe.py +24 -26
  967. transformers/models/olmoe/modeling_olmoe.py +49 -43
  968. transformers/models/olmoe/modular_olmoe.py +16 -15
  969. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -3
  970. transformers/models/omdet_turbo/modeling_omdet_turbo.py +42 -40
  971. transformers/models/omdet_turbo/processing_omdet_turbo.py +19 -67
  972. transformers/models/oneformer/configuration_oneformer.py +5 -8
  973. transformers/models/oneformer/image_processing_oneformer.py +83 -84
  974. transformers/models/oneformer/image_processing_oneformer_fast.py +33 -34
  975. transformers/models/oneformer/modeling_oneformer.py +130 -162
  976. transformers/models/oneformer/processing_oneformer.py +28 -43
  977. transformers/models/openai/configuration_openai.py +0 -1
  978. transformers/models/openai/modeling_openai.py +62 -51
  979. transformers/models/openai/tokenization_openai.py +2 -5
  980. transformers/models/opt/configuration_opt.py +0 -1
  981. transformers/models/opt/modeling_opt.py +74 -75
  982. transformers/models/ovis2/__init__.py +0 -1
  983. transformers/models/ovis2/configuration_ovis2.py +0 -1
  984. transformers/models/ovis2/image_processing_ovis2.py +22 -24
  985. transformers/models/ovis2/image_processing_ovis2_fast.py +6 -8
  986. transformers/models/ovis2/modeling_ovis2.py +58 -48
  987. transformers/models/ovis2/modular_ovis2.py +38 -32
  988. transformers/models/ovis2/processing_ovis2.py +12 -40
  989. transformers/models/owlv2/configuration_owlv2.py +0 -1
  990. transformers/models/owlv2/image_processing_owlv2.py +20 -21
  991. transformers/models/owlv2/image_processing_owlv2_fast.py +7 -10
  992. transformers/models/owlv2/modeling_owlv2.py +89 -90
  993. transformers/models/owlv2/modular_owlv2.py +6 -9
  994. transformers/models/owlv2/processing_owlv2.py +20 -49
  995. transformers/models/owlvit/configuration_owlvit.py +0 -1
  996. transformers/models/owlvit/image_processing_owlvit.py +21 -22
  997. transformers/models/owlvit/image_processing_owlvit_fast.py +2 -3
  998. transformers/models/owlvit/modeling_owlvit.py +88 -89
  999. transformers/models/owlvit/processing_owlvit.py +20 -48
  1000. transformers/models/paddleocr_vl/__init__.py +0 -1
  1001. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +19 -19
  1002. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +37 -37
  1003. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +12 -12
  1004. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +104 -90
  1005. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +90 -80
  1006. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +1 -3
  1007. transformers/models/paligemma/configuration_paligemma.py +0 -1
  1008. transformers/models/paligemma/modeling_paligemma.py +73 -67
  1009. transformers/models/paligemma/processing_paligemma.py +13 -66
  1010. transformers/models/parakeet/configuration_parakeet.py +1 -4
  1011. transformers/models/parakeet/feature_extraction_parakeet.py +10 -12
  1012. transformers/models/parakeet/modeling_parakeet.py +23 -22
  1013. transformers/models/parakeet/modular_parakeet.py +21 -18
  1014. transformers/models/parakeet/processing_parakeet.py +12 -5
  1015. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +5 -7
  1016. transformers/models/patchtsmixer/configuration_patchtsmixer.py +5 -8
  1017. transformers/models/patchtsmixer/modeling_patchtsmixer.py +64 -62
  1018. transformers/models/patchtst/configuration_patchtst.py +6 -9
  1019. transformers/models/patchtst/modeling_patchtst.py +77 -78
  1020. transformers/models/pe_audio/__init__.py +29 -0
  1021. transformers/models/pe_audio/configuration_pe_audio.py +204 -0
  1022. transformers/models/pe_audio/feature_extraction_pe_audio.py +160 -0
  1023. transformers/models/pe_audio/modeling_pe_audio.py +819 -0
  1024. transformers/models/pe_audio/modular_pe_audio.py +298 -0
  1025. transformers/models/pe_audio/processing_pe_audio.py +23 -0
  1026. transformers/models/pe_audio_video/__init__.py +28 -0
  1027. transformers/models/pe_audio_video/configuration_pe_audio_video.py +223 -0
  1028. transformers/models/pe_audio_video/modeling_pe_audio_video.py +971 -0
  1029. transformers/models/pe_audio_video/modular_pe_audio_video.py +763 -0
  1030. transformers/models/pe_audio_video/processing_pe_audio_video.py +24 -0
  1031. transformers/models/pe_video/__init__.py +29 -0
  1032. transformers/models/pe_video/configuration_pe_video.py +209 -0
  1033. transformers/models/pe_video/modeling_pe_video.py +635 -0
  1034. transformers/models/pe_video/modular_pe_video.py +218 -0
  1035. transformers/models/pe_video/processing_pe_video.py +10 -0
  1036. transformers/models/pe_video/video_processing_pe_video.py +64 -0
  1037. transformers/models/pegasus/configuration_pegasus.py +1 -1
  1038. transformers/models/pegasus/modeling_pegasus.py +66 -65
  1039. transformers/models/pegasus/tokenization_pegasus.py +1 -4
  1040. transformers/models/pegasus_x/configuration_pegasus_x.py +0 -1
  1041. transformers/models/pegasus_x/modeling_pegasus_x.py +51 -52
  1042. transformers/models/perceiver/configuration_perceiver.py +0 -1
  1043. transformers/models/perceiver/image_processing_perceiver.py +22 -25
  1044. transformers/models/perceiver/image_processing_perceiver_fast.py +5 -7
  1045. transformers/models/perceiver/modeling_perceiver.py +140 -137
  1046. transformers/models/perceiver/tokenization_perceiver.py +3 -6
  1047. transformers/models/perception_lm/configuration_perception_lm.py +0 -1
  1048. transformers/models/perception_lm/image_processing_perception_lm_fast.py +8 -10
  1049. transformers/models/perception_lm/modeling_perception_lm.py +45 -43
  1050. transformers/models/perception_lm/modular_perception_lm.py +38 -36
  1051. transformers/models/perception_lm/processing_perception_lm.py +13 -47
  1052. transformers/models/perception_lm/video_processing_perception_lm.py +0 -1
  1053. transformers/models/persimmon/configuration_persimmon.py +18 -21
  1054. transformers/models/persimmon/modeling_persimmon.py +40 -43
  1055. transformers/models/phi/configuration_phi.py +19 -22
  1056. transformers/models/phi/modeling_phi.py +36 -38
  1057. transformers/models/phi/modular_phi.py +23 -23
  1058. transformers/models/phi3/configuration_phi3.py +23 -26
  1059. transformers/models/phi3/modeling_phi3.py +34 -37
  1060. transformers/models/phi3/modular_phi3.py +13 -17
  1061. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +25 -26
  1062. transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +7 -9
  1063. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +7 -7
  1064. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +58 -57
  1065. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +62 -60
  1066. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +7 -44
  1067. transformers/models/phimoe/configuration_phimoe.py +26 -29
  1068. transformers/models/phimoe/modeling_phimoe.py +47 -42
  1069. transformers/models/phimoe/modular_phimoe.py +1 -2
  1070. transformers/models/phobert/tokenization_phobert.py +4 -6
  1071. transformers/models/pix2struct/configuration_pix2struct.py +0 -1
  1072. transformers/models/pix2struct/image_processing_pix2struct.py +15 -19
  1073. transformers/models/pix2struct/image_processing_pix2struct_fast.py +7 -10
  1074. transformers/models/pix2struct/modeling_pix2struct.py +42 -45
  1075. transformers/models/pix2struct/processing_pix2struct.py +5 -30
  1076. transformers/models/pixio/__init__.py +29 -0
  1077. transformers/models/pixio/configuration_pixio.py +150 -0
  1078. transformers/models/pixio/modeling_pixio.py +505 -0
  1079. transformers/models/pixio/modular_pixio.py +401 -0
  1080. transformers/models/pixtral/configuration_pixtral.py +11 -14
  1081. transformers/models/pixtral/image_processing_pixtral.py +26 -28
  1082. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -6
  1083. transformers/models/pixtral/modeling_pixtral.py +23 -26
  1084. transformers/models/pixtral/processing_pixtral.py +21 -53
  1085. transformers/models/plbart/configuration_plbart.py +1 -1
  1086. transformers/models/plbart/modeling_plbart.py +107 -102
  1087. transformers/models/plbart/modular_plbart.py +36 -32
  1088. transformers/models/plbart/tokenization_plbart.py +4 -5
  1089. transformers/models/poolformer/configuration_poolformer.py +0 -1
  1090. transformers/models/poolformer/image_processing_poolformer.py +21 -24
  1091. transformers/models/poolformer/image_processing_poolformer_fast.py +6 -8
  1092. transformers/models/poolformer/modeling_poolformer.py +21 -13
  1093. transformers/models/pop2piano/configuration_pop2piano.py +0 -2
  1094. transformers/models/pop2piano/feature_extraction_pop2piano.py +6 -9
  1095. transformers/models/pop2piano/modeling_pop2piano.py +22 -23
  1096. transformers/models/pop2piano/processing_pop2piano.py +25 -33
  1097. transformers/models/pop2piano/tokenization_pop2piano.py +15 -23
  1098. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +3 -3
  1099. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +28 -28
  1100. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +14 -15
  1101. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +9 -10
  1102. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +9 -10
  1103. transformers/models/prophetnet/configuration_prophetnet.py +26 -28
  1104. transformers/models/prophetnet/modeling_prophetnet.py +111 -131
  1105. transformers/models/prophetnet/tokenization_prophetnet.py +14 -16
  1106. transformers/models/pvt/configuration_pvt.py +0 -1
  1107. transformers/models/pvt/image_processing_pvt.py +17 -20
  1108. transformers/models/pvt/image_processing_pvt_fast.py +0 -1
  1109. transformers/models/pvt/modeling_pvt.py +19 -21
  1110. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  1111. transformers/models/pvt_v2/modeling_pvt_v2.py +21 -23
  1112. transformers/models/qwen2/configuration_qwen2.py +18 -21
  1113. transformers/models/qwen2/modeling_qwen2.py +32 -34
  1114. transformers/models/qwen2/modular_qwen2.py +11 -12
  1115. transformers/models/qwen2/tokenization_qwen2.py +2 -5
  1116. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +20 -23
  1117. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +239 -192
  1118. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +174 -127
  1119. transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +41 -49
  1120. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +22 -25
  1121. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +112 -101
  1122. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +72 -107
  1123. transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +7 -43
  1124. transformers/models/qwen2_audio/configuration_qwen2_audio.py +0 -1
  1125. transformers/models/qwen2_audio/modeling_qwen2_audio.py +29 -31
  1126. transformers/models/qwen2_audio/processing_qwen2_audio.py +13 -42
  1127. transformers/models/qwen2_moe/configuration_qwen2_moe.py +28 -31
  1128. transformers/models/qwen2_moe/modeling_qwen2_moe.py +48 -43
  1129. transformers/models/qwen2_moe/modular_qwen2_moe.py +7 -10
  1130. transformers/models/qwen2_vl/configuration_qwen2_vl.py +22 -24
  1131. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +41 -42
  1132. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +8 -9
  1133. transformers/models/qwen2_vl/modeling_qwen2_vl.py +108 -96
  1134. transformers/models/qwen2_vl/processing_qwen2_vl.py +7 -44
  1135. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +35 -13
  1136. transformers/models/qwen3/configuration_qwen3.py +20 -23
  1137. transformers/models/qwen3/modeling_qwen3.py +32 -35
  1138. transformers/models/qwen3/modular_qwen3.py +4 -6
  1139. transformers/models/qwen3_moe/configuration_qwen3_moe.py +25 -28
  1140. transformers/models/qwen3_moe/modeling_qwen3_moe.py +48 -43
  1141. transformers/models/qwen3_moe/modular_qwen3_moe.py +10 -13
  1142. transformers/models/qwen3_next/configuration_qwen3_next.py +31 -34
  1143. transformers/models/qwen3_next/modeling_qwen3_next.py +43 -48
  1144. transformers/models/qwen3_next/modular_qwen3_next.py +33 -34
  1145. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +89 -88
  1146. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +199 -156
  1147. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +170 -152
  1148. transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +40 -48
  1149. transformers/models/qwen3_vl/configuration_qwen3_vl.py +21 -24
  1150. transformers/models/qwen3_vl/modeling_qwen3_vl.py +91 -81
  1151. transformers/models/qwen3_vl/modular_qwen3_vl.py +86 -112
  1152. transformers/models/qwen3_vl/processing_qwen3_vl.py +6 -42
  1153. transformers/models/qwen3_vl/video_processing_qwen3_vl.py +10 -12
  1154. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +21 -25
  1155. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +174 -195
  1156. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +65 -117
  1157. transformers/models/rag/configuration_rag.py +0 -9
  1158. transformers/models/rag/modeling_rag.py +123 -127
  1159. transformers/models/rag/retrieval_rag.py +2 -4
  1160. transformers/models/rag/tokenization_rag.py +0 -50
  1161. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +21 -24
  1162. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +34 -36
  1163. transformers/models/reformer/configuration_reformer.py +0 -1
  1164. transformers/models/reformer/modeling_reformer.py +76 -69
  1165. transformers/models/reformer/tokenization_reformer.py +3 -6
  1166. transformers/models/regnet/configuration_regnet.py +0 -1
  1167. transformers/models/regnet/modeling_regnet.py +11 -9
  1168. transformers/models/rembert/configuration_rembert.py +0 -1
  1169. transformers/models/rembert/modeling_rembert.py +115 -111
  1170. transformers/models/rembert/tokenization_rembert.py +1 -4
  1171. transformers/models/resnet/configuration_resnet.py +0 -1
  1172. transformers/models/resnet/modeling_resnet.py +16 -13
  1173. transformers/models/roberta/configuration_roberta.py +0 -1
  1174. transformers/models/roberta/modeling_roberta.py +94 -93
  1175. transformers/models/roberta/modular_roberta.py +58 -58
  1176. transformers/models/roberta/tokenization_roberta.py +2 -5
  1177. transformers/models/roberta/tokenization_roberta_old.py +2 -4
  1178. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +0 -1
  1179. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +94 -93
  1180. transformers/models/roc_bert/configuration_roc_bert.py +0 -1
  1181. transformers/models/roc_bert/modeling_roc_bert.py +122 -121
  1182. transformers/models/roc_bert/tokenization_roc_bert.py +88 -94
  1183. transformers/models/roformer/configuration_roformer.py +0 -1
  1184. transformers/models/roformer/modeling_roformer.py +79 -81
  1185. transformers/models/roformer/tokenization_roformer.py +3 -6
  1186. transformers/models/roformer/tokenization_utils.py +0 -1
  1187. transformers/models/rt_detr/configuration_rt_detr.py +1 -2
  1188. transformers/models/rt_detr/configuration_rt_detr_resnet.py +0 -1
  1189. transformers/models/rt_detr/image_processing_rt_detr.py +54 -55
  1190. transformers/models/rt_detr/image_processing_rt_detr_fast.py +15 -15
  1191. transformers/models/rt_detr/modeling_rt_detr.py +84 -82
  1192. transformers/models/rt_detr/modeling_rt_detr_resnet.py +10 -7
  1193. transformers/models/rt_detr/modular_rt_detr.py +14 -14
  1194. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -4
  1195. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +86 -81
  1196. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +10 -7
  1197. transformers/models/rwkv/configuration_rwkv.py +0 -1
  1198. transformers/models/rwkv/modeling_rwkv.py +30 -32
  1199. transformers/models/sam/configuration_sam.py +1 -1
  1200. transformers/models/sam/image_processing_sam.py +59 -60
  1201. transformers/models/sam/image_processing_sam_fast.py +21 -23
  1202. transformers/models/sam/modeling_sam.py +37 -36
  1203. transformers/models/sam/processing_sam.py +39 -27
  1204. transformers/models/sam2/configuration_sam2.py +1 -2
  1205. transformers/models/sam2/image_processing_sam2_fast.py +14 -15
  1206. transformers/models/sam2/modeling_sam2.py +50 -48
  1207. transformers/models/sam2/modular_sam2.py +48 -45
  1208. transformers/models/sam2/processing_sam2.py +31 -47
  1209. transformers/models/sam2_video/configuration_sam2_video.py +0 -1
  1210. transformers/models/sam2_video/modeling_sam2_video.py +119 -112
  1211. transformers/models/sam2_video/modular_sam2_video.py +91 -97
  1212. transformers/models/sam2_video/processing_sam2_video.py +49 -66
  1213. transformers/models/sam2_video/video_processing_sam2_video.py +1 -4
  1214. transformers/models/sam3/configuration_sam3.py +21 -2
  1215. transformers/models/sam3/image_processing_sam3_fast.py +17 -20
  1216. transformers/models/sam3/modeling_sam3.py +77 -56
  1217. transformers/models/sam3/modular_sam3.py +3 -8
  1218. transformers/models/sam3/processing_sam3.py +29 -48
  1219. transformers/models/sam3_tracker/__init__.py +0 -1
  1220. transformers/models/sam3_tracker/configuration_sam3_tracker.py +0 -1
  1221. transformers/models/sam3_tracker/modeling_sam3_tracker.py +36 -36
  1222. transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -1
  1223. transformers/models/sam3_tracker/processing_sam3_tracker.py +31 -47
  1224. transformers/models/sam3_tracker_video/__init__.py +0 -1
  1225. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -1
  1226. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +96 -85
  1227. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +27 -6
  1228. transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +50 -66
  1229. transformers/models/sam3_video/configuration_sam3_video.py +14 -1
  1230. transformers/models/sam3_video/modeling_sam3_video.py +32 -34
  1231. transformers/models/sam3_video/processing_sam3_video.py +26 -46
  1232. transformers/models/sam_hq/__init__.py +1 -1
  1233. transformers/models/sam_hq/configuration_sam_hq.py +1 -1
  1234. transformers/models/sam_hq/modeling_sam_hq.py +65 -64
  1235. transformers/models/sam_hq/modular_sam_hq.py +17 -19
  1236. transformers/models/sam_hq/{processing_samhq.py → processing_sam_hq.py} +39 -28
  1237. transformers/models/seamless_m4t/configuration_seamless_m4t.py +0 -1
  1238. transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py +8 -11
  1239. transformers/models/seamless_m4t/modeling_seamless_m4t.py +207 -193
  1240. transformers/models/seamless_m4t/processing_seamless_m4t.py +18 -39
  1241. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +15 -20
  1242. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +0 -1
  1243. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +199 -195
  1244. transformers/models/seed_oss/configuration_seed_oss.py +23 -25
  1245. transformers/models/seed_oss/modeling_seed_oss.py +31 -33
  1246. transformers/models/seed_oss/modular_seed_oss.py +3 -4
  1247. transformers/models/segformer/configuration_segformer.py +0 -10
  1248. transformers/models/segformer/image_processing_segformer.py +39 -42
  1249. transformers/models/segformer/image_processing_segformer_fast.py +7 -9
  1250. transformers/models/segformer/modeling_segformer.py +26 -28
  1251. transformers/models/segformer/modular_segformer.py +5 -7
  1252. transformers/models/seggpt/configuration_seggpt.py +0 -1
  1253. transformers/models/seggpt/image_processing_seggpt.py +38 -41
  1254. transformers/models/seggpt/modeling_seggpt.py +28 -30
  1255. transformers/models/sew/configuration_sew.py +0 -1
  1256. transformers/models/sew/modeling_sew.py +33 -35
  1257. transformers/models/sew/modular_sew.py +10 -12
  1258. transformers/models/sew_d/configuration_sew_d.py +0 -1
  1259. transformers/models/sew_d/modeling_sew_d.py +28 -30
  1260. transformers/models/shieldgemma2/configuration_shieldgemma2.py +0 -1
  1261. transformers/models/shieldgemma2/modeling_shieldgemma2.py +16 -17
  1262. transformers/models/shieldgemma2/processing_shieldgemma2.py +3 -5
  1263. transformers/models/siglip/configuration_siglip.py +0 -1
  1264. transformers/models/siglip/image_processing_siglip.py +17 -20
  1265. transformers/models/siglip/image_processing_siglip_fast.py +0 -1
  1266. transformers/models/siglip/modeling_siglip.py +62 -41
  1267. transformers/models/siglip/processing_siglip.py +2 -14
  1268. transformers/models/siglip/tokenization_siglip.py +6 -7
  1269. transformers/models/siglip2/configuration_siglip2.py +1 -1
  1270. transformers/models/siglip2/image_processing_siglip2.py +15 -16
  1271. transformers/models/siglip2/image_processing_siglip2_fast.py +4 -5
  1272. transformers/models/siglip2/modeling_siglip2.py +114 -92
  1273. transformers/models/siglip2/modular_siglip2.py +23 -25
  1274. transformers/models/siglip2/processing_siglip2.py +2 -14
  1275. transformers/models/smollm3/configuration_smollm3.py +23 -26
  1276. transformers/models/smollm3/modeling_smollm3.py +32 -35
  1277. transformers/models/smollm3/modular_smollm3.py +27 -29
  1278. transformers/models/smolvlm/configuration_smolvlm.py +1 -1
  1279. transformers/models/smolvlm/image_processing_smolvlm.py +42 -43
  1280. transformers/models/smolvlm/image_processing_smolvlm_fast.py +12 -12
  1281. transformers/models/smolvlm/modeling_smolvlm.py +56 -53
  1282. transformers/models/smolvlm/modular_smolvlm.py +15 -17
  1283. transformers/models/smolvlm/processing_smolvlm.py +15 -76
  1284. transformers/models/smolvlm/video_processing_smolvlm.py +7 -9
  1285. transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +0 -1
  1286. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +20 -23
  1287. transformers/models/speech_to_text/configuration_speech_to_text.py +0 -1
  1288. transformers/models/speech_to_text/feature_extraction_speech_to_text.py +10 -13
  1289. transformers/models/speech_to_text/modeling_speech_to_text.py +62 -54
  1290. transformers/models/speech_to_text/processing_speech_to_text.py +4 -30
  1291. transformers/models/speech_to_text/tokenization_speech_to_text.py +5 -6
  1292. transformers/models/speecht5/configuration_speecht5.py +0 -1
  1293. transformers/models/speecht5/feature_extraction_speecht5.py +16 -37
  1294. transformers/models/speecht5/modeling_speecht5.py +200 -174
  1295. transformers/models/speecht5/number_normalizer.py +0 -1
  1296. transformers/models/speecht5/processing_speecht5.py +3 -37
  1297. transformers/models/speecht5/tokenization_speecht5.py +4 -5
  1298. transformers/models/splinter/configuration_splinter.py +0 -1
  1299. transformers/models/splinter/modeling_splinter.py +63 -59
  1300. transformers/models/splinter/tokenization_splinter.py +2 -4
  1301. transformers/models/squeezebert/configuration_squeezebert.py +0 -1
  1302. transformers/models/squeezebert/modeling_squeezebert.py +62 -62
  1303. transformers/models/squeezebert/tokenization_squeezebert.py +0 -1
  1304. transformers/models/stablelm/configuration_stablelm.py +20 -23
  1305. transformers/models/stablelm/modeling_stablelm.py +40 -43
  1306. transformers/models/starcoder2/configuration_starcoder2.py +19 -22
  1307. transformers/models/starcoder2/modeling_starcoder2.py +34 -37
  1308. transformers/models/starcoder2/modular_starcoder2.py +13 -15
  1309. transformers/models/superglue/configuration_superglue.py +3 -3
  1310. transformers/models/superglue/image_processing_superglue.py +15 -15
  1311. transformers/models/superglue/image_processing_superglue_fast.py +5 -7
  1312. transformers/models/superglue/modeling_superglue.py +32 -33
  1313. transformers/models/superpoint/image_processing_superpoint.py +15 -15
  1314. transformers/models/superpoint/image_processing_superpoint_fast.py +5 -7
  1315. transformers/models/superpoint/modeling_superpoint.py +13 -14
  1316. transformers/models/swiftformer/configuration_swiftformer.py +0 -1
  1317. transformers/models/swiftformer/modeling_swiftformer.py +16 -14
  1318. transformers/models/swin/configuration_swin.py +0 -1
  1319. transformers/models/swin/modeling_swin.py +74 -82
  1320. transformers/models/swin2sr/configuration_swin2sr.py +0 -1
  1321. transformers/models/swin2sr/image_processing_swin2sr.py +10 -13
  1322. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -6
  1323. transformers/models/swin2sr/modeling_swin2sr.py +75 -61
  1324. transformers/models/swinv2/configuration_swinv2.py +0 -1
  1325. transformers/models/swinv2/modeling_swinv2.py +96 -100
  1326. transformers/models/switch_transformers/configuration_switch_transformers.py +0 -1
  1327. transformers/models/switch_transformers/modeling_switch_transformers.py +34 -41
  1328. transformers/models/switch_transformers/modular_switch_transformers.py +31 -38
  1329. transformers/models/t5/configuration_t5.py +7 -2
  1330. transformers/models/t5/modeling_t5.py +76 -84
  1331. transformers/models/t5/tokenization_t5.py +1 -3
  1332. transformers/models/t5gemma/configuration_t5gemma.py +33 -34
  1333. transformers/models/t5gemma/modeling_t5gemma.py +97 -100
  1334. transformers/models/t5gemma/modular_t5gemma.py +117 -118
  1335. transformers/models/t5gemma2/configuration_t5gemma2.py +59 -96
  1336. transformers/models/t5gemma2/modeling_t5gemma2.py +109 -103
  1337. transformers/models/t5gemma2/modular_t5gemma2.py +375 -91
  1338. transformers/models/table_transformer/configuration_table_transformer.py +1 -2
  1339. transformers/models/table_transformer/modeling_table_transformer.py +47 -49
  1340. transformers/models/tapas/configuration_tapas.py +0 -1
  1341. transformers/models/tapas/modeling_tapas.py +64 -66
  1342. transformers/models/tapas/tokenization_tapas.py +115 -153
  1343. transformers/models/textnet/configuration_textnet.py +0 -1
  1344. transformers/models/textnet/image_processing_textnet.py +22 -25
  1345. transformers/models/textnet/image_processing_textnet_fast.py +5 -7
  1346. transformers/models/textnet/modeling_textnet.py +13 -14
  1347. transformers/models/time_series_transformer/configuration_time_series_transformer.py +5 -8
  1348. transformers/models/time_series_transformer/modeling_time_series_transformer.py +79 -81
  1349. transformers/models/timesfm/configuration_timesfm.py +0 -1
  1350. transformers/models/timesfm/modeling_timesfm.py +29 -19
  1351. transformers/models/timesfm/modular_timesfm.py +28 -18
  1352. transformers/models/timesformer/configuration_timesformer.py +0 -1
  1353. transformers/models/timesformer/modeling_timesformer.py +13 -16
  1354. transformers/models/timm_backbone/configuration_timm_backbone.py +0 -1
  1355. transformers/models/timm_backbone/modeling_timm_backbone.py +17 -15
  1356. transformers/models/timm_wrapper/configuration_timm_wrapper.py +5 -3
  1357. transformers/models/timm_wrapper/image_processing_timm_wrapper.py +4 -5
  1358. transformers/models/timm_wrapper/modeling_timm_wrapper.py +32 -28
  1359. transformers/models/trocr/configuration_trocr.py +0 -1
  1360. transformers/models/trocr/modeling_trocr.py +39 -42
  1361. transformers/models/trocr/processing_trocr.py +5 -25
  1362. transformers/models/tvp/configuration_tvp.py +5 -2
  1363. transformers/models/tvp/image_processing_tvp.py +50 -52
  1364. transformers/models/tvp/image_processing_tvp_fast.py +9 -10
  1365. transformers/models/tvp/modeling_tvp.py +25 -27
  1366. transformers/models/tvp/processing_tvp.py +2 -14
  1367. transformers/models/udop/configuration_udop.py +1 -1
  1368. transformers/models/udop/modeling_udop.py +63 -70
  1369. transformers/models/udop/processing_udop.py +7 -26
  1370. transformers/models/udop/tokenization_udop.py +80 -93
  1371. transformers/models/umt5/configuration_umt5.py +2 -3
  1372. transformers/models/umt5/modeling_umt5.py +80 -87
  1373. transformers/models/unispeech/configuration_unispeech.py +0 -1
  1374. transformers/models/unispeech/modeling_unispeech.py +47 -49
  1375. transformers/models/unispeech/modular_unispeech.py +20 -22
  1376. transformers/models/unispeech_sat/configuration_unispeech_sat.py +0 -1
  1377. transformers/models/unispeech_sat/modeling_unispeech_sat.py +63 -65
  1378. transformers/models/unispeech_sat/modular_unispeech_sat.py +21 -23
  1379. transformers/models/univnet/feature_extraction_univnet.py +14 -14
  1380. transformers/models/univnet/modeling_univnet.py +7 -8
  1381. transformers/models/upernet/configuration_upernet.py +0 -1
  1382. transformers/models/upernet/modeling_upernet.py +10 -13
  1383. transformers/models/vaultgemma/__init__.py +0 -1
  1384. transformers/models/vaultgemma/configuration_vaultgemma.py +24 -26
  1385. transformers/models/vaultgemma/modeling_vaultgemma.py +35 -37
  1386. transformers/models/vaultgemma/modular_vaultgemma.py +29 -31
  1387. transformers/models/video_llama_3/image_processing_video_llama_3.py +43 -42
  1388. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +8 -8
  1389. transformers/models/video_llama_3/modeling_video_llama_3.py +77 -66
  1390. transformers/models/video_llama_3/modular_video_llama_3.py +110 -112
  1391. transformers/models/video_llama_3/processing_video_llama_3.py +5 -39
  1392. transformers/models/video_llama_3/video_processing_video_llama_3.py +18 -18
  1393. transformers/models/video_llava/configuration_video_llava.py +0 -1
  1394. transformers/models/video_llava/image_processing_video_llava.py +35 -38
  1395. transformers/models/video_llava/modeling_video_llava.py +59 -57
  1396. transformers/models/video_llava/processing_video_llava.py +38 -78
  1397. transformers/models/video_llava/video_processing_video_llava.py +0 -1
  1398. transformers/models/videomae/configuration_videomae.py +0 -1
  1399. transformers/models/videomae/image_processing_videomae.py +31 -34
  1400. transformers/models/videomae/modeling_videomae.py +13 -15
  1401. transformers/models/videomae/video_processing_videomae.py +0 -1
  1402. transformers/models/vilt/configuration_vilt.py +2 -3
  1403. transformers/models/vilt/image_processing_vilt.py +29 -30
  1404. transformers/models/vilt/image_processing_vilt_fast.py +9 -10
  1405. transformers/models/vilt/modeling_vilt.py +83 -78
  1406. transformers/models/vilt/processing_vilt.py +2 -14
  1407. transformers/models/vipllava/configuration_vipllava.py +0 -1
  1408. transformers/models/vipllava/modeling_vipllava.py +45 -42
  1409. transformers/models/vipllava/modular_vipllava.py +30 -32
  1410. transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +0 -1
  1411. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +18 -21
  1412. transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +0 -1
  1413. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +18 -21
  1414. transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +2 -16
  1415. transformers/models/visual_bert/configuration_visual_bert.py +0 -1
  1416. transformers/models/visual_bert/modeling_visual_bert.py +92 -92
  1417. transformers/models/vit/configuration_vit.py +0 -1
  1418. transformers/models/vit/image_processing_vit.py +19 -22
  1419. transformers/models/vit/image_processing_vit_fast.py +0 -1
  1420. transformers/models/vit/modeling_vit.py +13 -15
  1421. transformers/models/vit_mae/configuration_vit_mae.py +0 -1
  1422. transformers/models/vit_mae/modeling_vit_mae.py +21 -23
  1423. transformers/models/vit_msn/configuration_vit_msn.py +0 -1
  1424. transformers/models/vit_msn/modeling_vit_msn.py +10 -12
  1425. transformers/models/vitdet/configuration_vitdet.py +0 -1
  1426. transformers/models/vitdet/modeling_vitdet.py +12 -14
  1427. transformers/models/vitmatte/configuration_vitmatte.py +2 -5
  1428. transformers/models/vitmatte/image_processing_vitmatte.py +15 -18
  1429. transformers/models/vitmatte/image_processing_vitmatte_fast.py +14 -16
  1430. transformers/models/vitmatte/modeling_vitmatte.py +13 -11
  1431. transformers/models/vitpose/configuration_vitpose.py +4 -7
  1432. transformers/models/vitpose/image_processing_vitpose.py +24 -25
  1433. transformers/models/vitpose/image_processing_vitpose_fast.py +9 -11
  1434. transformers/models/vitpose/modeling_vitpose.py +10 -12
  1435. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +0 -1
  1436. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +8 -10
  1437. transformers/models/vits/configuration_vits.py +0 -1
  1438. transformers/models/vits/modeling_vits.py +34 -35
  1439. transformers/models/vits/tokenization_vits.py +3 -4
  1440. transformers/models/vivit/configuration_vivit.py +0 -1
  1441. transformers/models/vivit/image_processing_vivit.py +36 -39
  1442. transformers/models/vivit/modeling_vivit.py +5 -7
  1443. transformers/models/vjepa2/__init__.py +0 -1
  1444. transformers/models/vjepa2/configuration_vjepa2.py +0 -1
  1445. transformers/models/vjepa2/modeling_vjepa2.py +30 -32
  1446. transformers/models/vjepa2/video_processing_vjepa2.py +0 -1
  1447. transformers/models/voxtral/__init__.py +0 -1
  1448. transformers/models/voxtral/configuration_voxtral.py +0 -1
  1449. transformers/models/voxtral/modeling_voxtral.py +19 -27
  1450. transformers/models/voxtral/modular_voxtral.py +12 -21
  1451. transformers/models/voxtral/processing_voxtral.py +25 -48
  1452. transformers/models/wav2vec2/configuration_wav2vec2.py +0 -1
  1453. transformers/models/wav2vec2/feature_extraction_wav2vec2.py +7 -10
  1454. transformers/models/wav2vec2/modeling_wav2vec2.py +67 -122
  1455. transformers/models/wav2vec2/processing_wav2vec2.py +6 -35
  1456. transformers/models/wav2vec2/tokenization_wav2vec2.py +20 -332
  1457. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +0 -1
  1458. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +65 -62
  1459. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +52 -48
  1460. transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +6 -35
  1461. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +0 -1
  1462. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +84 -77
  1463. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +37 -30
  1464. transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +16 -17
  1465. transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +36 -55
  1466. transformers/models/wavlm/configuration_wavlm.py +0 -1
  1467. transformers/models/wavlm/modeling_wavlm.py +45 -48
  1468. transformers/models/wavlm/modular_wavlm.py +4 -5
  1469. transformers/models/whisper/configuration_whisper.py +0 -1
  1470. transformers/models/whisper/english_normalizer.py +3 -4
  1471. transformers/models/whisper/feature_extraction_whisper.py +9 -24
  1472. transformers/models/whisper/generation_whisper.py +27 -48
  1473. transformers/models/whisper/modeling_whisper.py +73 -73
  1474. transformers/models/whisper/processing_whisper.py +3 -20
  1475. transformers/models/whisper/tokenization_whisper.py +9 -30
  1476. transformers/models/x_clip/configuration_x_clip.py +0 -1
  1477. transformers/models/x_clip/modeling_x_clip.py +70 -69
  1478. transformers/models/x_clip/processing_x_clip.py +2 -14
  1479. transformers/models/xcodec/configuration_xcodec.py +4 -6
  1480. transformers/models/xcodec/modeling_xcodec.py +20 -17
  1481. transformers/models/xglm/configuration_xglm.py +0 -1
  1482. transformers/models/xglm/modeling_xglm.py +59 -55
  1483. transformers/models/xglm/tokenization_xglm.py +1 -4
  1484. transformers/models/xlm/configuration_xlm.py +0 -1
  1485. transformers/models/xlm/modeling_xlm.py +139 -144
  1486. transformers/models/xlm/tokenization_xlm.py +3 -5
  1487. transformers/models/xlm_roberta/configuration_xlm_roberta.py +0 -1
  1488. transformers/models/xlm_roberta/modeling_xlm_roberta.py +195 -194
  1489. transformers/models/xlm_roberta/modular_xlm_roberta.py +50 -53
  1490. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +1 -4
  1491. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +0 -1
  1492. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +94 -93
  1493. transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +67 -70
  1494. transformers/models/xlnet/configuration_xlnet.py +0 -11
  1495. transformers/models/xlnet/modeling_xlnet.py +152 -163
  1496. transformers/models/xlnet/tokenization_xlnet.py +1 -4
  1497. transformers/models/xlstm/configuration_xlstm.py +3 -5
  1498. transformers/models/xlstm/modeling_xlstm.py +62 -65
  1499. transformers/models/xmod/configuration_xmod.py +0 -1
  1500. transformers/models/xmod/modeling_xmod.py +101 -100
  1501. transformers/models/yolos/configuration_yolos.py +0 -1
  1502. transformers/models/yolos/image_processing_yolos.py +60 -62
  1503. transformers/models/yolos/image_processing_yolos_fast.py +18 -18
  1504. transformers/models/yolos/modeling_yolos.py +12 -14
  1505. transformers/models/yolos/modular_yolos.py +2 -4
  1506. transformers/models/yoso/configuration_yoso.py +0 -1
  1507. transformers/models/yoso/modeling_yoso.py +64 -63
  1508. transformers/models/zamba/configuration_zamba.py +0 -1
  1509. transformers/models/zamba/modeling_zamba.py +70 -70
  1510. transformers/models/zamba2/configuration_zamba2.py +36 -37
  1511. transformers/models/zamba2/modeling_zamba2.py +87 -89
  1512. transformers/models/zamba2/modular_zamba2.py +43 -45
  1513. transformers/models/zoedepth/configuration_zoedepth.py +1 -2
  1514. transformers/models/zoedepth/image_processing_zoedepth.py +28 -29
  1515. transformers/models/zoedepth/image_processing_zoedepth_fast.py +12 -15
  1516. transformers/models/zoedepth/modeling_zoedepth.py +21 -16
  1517. transformers/pipelines/__init__.py +59 -55
  1518. transformers/pipelines/any_to_any.py +14 -22
  1519. transformers/pipelines/audio_utils.py +1 -2
  1520. transformers/pipelines/automatic_speech_recognition.py +20 -12
  1521. transformers/pipelines/base.py +13 -17
  1522. transformers/pipelines/deprecated/__init__.py +0 -1
  1523. transformers/pipelines/document_question_answering.py +1 -1
  1524. transformers/pipelines/image_text_to_text.py +0 -1
  1525. transformers/pipelines/image_to_text.py +4 -44
  1526. transformers/pipelines/question_answering.py +5 -44
  1527. transformers/pipelines/text_classification.py +1 -14
  1528. transformers/pipelines/text_to_audio.py +2 -2
  1529. transformers/pipelines/token_classification.py +1 -22
  1530. transformers/pipelines/video_classification.py +1 -9
  1531. transformers/pipelines/zero_shot_audio_classification.py +0 -1
  1532. transformers/pipelines/zero_shot_classification.py +0 -6
  1533. transformers/pipelines/zero_shot_image_classification.py +0 -7
  1534. transformers/processing_utils.py +222 -151
  1535. transformers/quantizers/auto.py +2 -4
  1536. transformers/quantizers/base.py +19 -64
  1537. transformers/quantizers/quantizer_aqlm.py +1 -18
  1538. transformers/quantizers/quantizer_auto_round.py +1 -10
  1539. transformers/quantizers/quantizer_awq.py +3 -8
  1540. transformers/quantizers/quantizer_bitnet.py +1 -6
  1541. transformers/quantizers/quantizer_bnb_4bit.py +9 -49
  1542. transformers/quantizers/quantizer_bnb_8bit.py +9 -19
  1543. transformers/quantizers/quantizer_compressed_tensors.py +1 -4
  1544. transformers/quantizers/quantizer_eetq.py +2 -12
  1545. transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
  1546. transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
  1547. transformers/quantizers/quantizer_fp_quant.py +4 -4
  1548. transformers/quantizers/quantizer_gptq.py +1 -4
  1549. transformers/quantizers/quantizer_higgs.py +2 -6
  1550. transformers/quantizers/quantizer_mxfp4.py +2 -28
  1551. transformers/quantizers/quantizer_quanto.py +14 -14
  1552. transformers/quantizers/quantizer_quark.py +0 -1
  1553. transformers/quantizers/quantizer_spqr.py +3 -8
  1554. transformers/quantizers/quantizer_torchao.py +31 -127
  1555. transformers/quantizers/quantizer_vptq.py +1 -10
  1556. transformers/testing_utils.py +31 -49
  1557. transformers/tokenization_mistral_common.py +554 -902
  1558. transformers/tokenization_utils_base.py +112 -124
  1559. transformers/tokenization_utils_sentencepiece.py +5 -6
  1560. transformers/tokenization_utils_tokenizers.py +30 -7
  1561. transformers/trainer.py +30 -11
  1562. transformers/trainer_callback.py +8 -0
  1563. transformers/trainer_jit_checkpoint.py +1 -2
  1564. transformers/trainer_seq2seq.py +4 -0
  1565. transformers/training_args.py +11 -13
  1566. transformers/utils/__init__.py +4 -0
  1567. transformers/utils/attention_visualizer.py +5 -5
  1568. transformers/utils/auto_docstring.py +598 -37
  1569. transformers/utils/doc.py +1 -1
  1570. transformers/utils/dummy_pt_objects.py +0 -42
  1571. transformers/utils/generic.py +21 -1
  1572. transformers/utils/import_utils.py +51 -9
  1573. transformers/utils/kernel_config.py +71 -18
  1574. transformers/utils/loading_report.py +3 -3
  1575. transformers/utils/quantization_config.py +16 -18
  1576. transformers/video_processing_utils.py +35 -32
  1577. transformers/video_utils.py +18 -22
  1578. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc3.dist-info}/METADATA +23 -24
  1579. transformers-5.0.0rc3.dist-info/RECORD +2067 -0
  1580. transformers-5.0.0rc1.dist-info/RECORD +0 -2003
  1581. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc3.dist-info}/WHEEL +0 -0
  1582. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc3.dist-info}/entry_points.txt +0 -0
  1583. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc3.dist-info}/licenses/LICENSE +0 -0
  1584. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc3.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,3 @@
1
- # coding=utf-8
2
1
  # Copyright 2022 The HuggingFace Inc. team.
3
2
  #
4
3
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,12 +19,11 @@ import os
20
19
  from abc import ABC, abstractmethod
21
20
  from collections.abc import Callable
22
21
  from dataclasses import dataclass, is_dataclass
23
- from typing import TYPE_CHECKING, Any, Optional
22
+ from typing import TYPE_CHECKING, Any, Optional, Union
24
23
 
25
24
  from huggingface_hub import create_repo
26
25
 
27
26
  from .. import __version__
28
- from ..configuration_utils import PreTrainedConfig
29
27
  from ..utils import (
30
28
  GENERATION_CONFIG_NAME,
31
29
  ExplicitEnum,
@@ -38,6 +36,7 @@ from ..utils import (
38
36
 
39
37
 
40
38
  if TYPE_CHECKING:
39
+ from ..configuration_utils import PreTrainedConfig
41
40
  from ..modeling_utils import PreTrainedModel
42
41
 
43
42
 
@@ -101,21 +100,25 @@ class GenerationConfig(PushToHubMixin):
101
100
 
102
101
  </Tip>
103
102
 
103
+ Note: the configuration field that are still `None` will be overriden by `GenerationConfig._get_default_generation_params()`
104
+ during the generation loop. If you want to use different values for these fields, make sure to explicitly set them in the
105
+ generation config.
106
+
104
107
  Arg:
105
108
  > Parameters that control the length of the output
106
109
 
107
- max_length (`int`, *optional*, defaults to 20):
110
+ max_length (`int`, *optional*):
108
111
  `max_new_tokens` is recommended for controlling how many tokens the model generates.
109
112
  `max_length` remains for backward compatibility.
110
113
 
111
114
  max_new_tokens (`int`, *optional*):
112
115
  The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.
113
- min_length (`int`, *optional*, defaults to 0):
116
+ min_length (`int`, *optional*):
114
117
  The minimum length of the sequence to be generated. Corresponds to the length of the input prompt +
115
118
  `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.
116
119
  min_new_tokens (`int`, *optional*):
117
120
  The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt.
118
- early_stopping (`bool` or `str`, *optional*, defaults to `False`):
121
+ early_stopping (`bool` or `str`, *optional*):
119
122
  Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values:
120
123
  `True`, where the generation stops as soon as there are `num_beams` complete candidates; `False`, where an
121
124
  heuristic is applied and the generation stops when is it very unlikely to find better candidates;
@@ -129,17 +132,17 @@ class GenerationConfig(PushToHubMixin):
129
132
 
130
133
  > Parameters that control the generation strategy used
131
134
 
132
- do_sample (`bool`, *optional*, defaults to `False`):
135
+ do_sample (`bool`):
133
136
  Whether or not to use sampling ; use greedy decoding otherwise.
134
- num_beams (`int`, *optional*, defaults to 1):
137
+ num_beams (`int`, *optional*):
135
138
  Number of beams for beam search. 1 means no beam search.
136
139
 
137
140
  > Parameters that control the cache
138
141
 
139
- use_cache (`bool`, *optional*, defaults to `True`):
142
+ use_cache (`bool`):
140
143
  Whether or not the model should use the past last key/values attentions (if applicable to the model) to
141
144
  speed up decoding.
142
- cache_implementation (`str`, *optional*, default to `None`):
145
+ cache_implementation (`str`, *optional*):
143
146
  Name of the cache class that will be instantiated in `generate`, for faster decoding. Possible values are:
144
147
 
145
148
  - `"dynamic"`: [`DynamicCache`]
@@ -155,11 +158,11 @@ class GenerationConfig(PushToHubMixin):
155
158
 
156
159
  > Parameters for manipulation of the model output logits
157
160
 
158
- temperature (`float`, *optional*, defaults to 1.0):
161
+ temperature (`float`, *optional*):
159
162
  The value used to module the next token probabilities. This value is set in a model's `generation_config.json` file. If it isn't set, the default value is 1.0
160
- top_k (`int`, *optional*, defaults to 50):
163
+ top_k (`int`, *optional*):
161
164
  The number of highest probability vocabulary tokens to keep for top-k-filtering. This value is set in a model's `generation_config.json` file. If it isn't set, the default value is 50.
162
- top_p (`float`, *optional*, defaults to 1.0):
165
+ top_p (`float`, *optional*):
163
166
  If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to
164
167
  `top_p` or higher are kept for generation. This value is set in a model's `generation_config.json` file. If it isn't set, the default value is 1.0
165
168
  min_p (`float`, *optional*):
@@ -172,41 +175,41 @@ class GenerationConfig(PushToHubMixin):
172
175
  is kept whose *renormalized* entropy is less than or equal to `top_h` times the entropy of the full distribution.
173
176
  Smaller values (e.g., 0.2–0.5) lead to more focused, deterministic outputs, while values closer to 1.0 allow more
174
177
  randomness and diversity. Typical values are in the 0.3–0.6 range.
175
- typical_p (`float`, *optional*, defaults to 1.0):
178
+ typical_p (`float`, *optional*):
176
179
  Local typicality measures how similar the conditional probability of predicting a target token next is to
177
180
  the expected conditional probability of predicting a random token next, given the partial text already
178
181
  generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that
179
182
  add up to `typical_p` or higher are kept for generation. See [this
180
183
  paper](https://huggingface.co/papers/2202.00666) for more details.
181
- epsilon_cutoff (`float`, *optional*, defaults to 0.0):
184
+ epsilon_cutoff (`float`, *optional*):
182
185
  If set to float strictly between 0 and 1, only tokens with a conditional probability greater than
183
186
  `epsilon_cutoff` will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the
184
187
  size of the model. See [Truncation Sampling as Language Model
185
188
  Desmoothing](https://huggingface.co/papers/2210.15191) for more details.
186
- eta_cutoff (`float`, *optional*, defaults to 0.0):
189
+ eta_cutoff (`float`, *optional*):
187
190
  Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between
188
191
  0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) *
189
192
  exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token
190
193
  probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3,
191
194
  depending on the size of the model. See [Truncation Sampling as Language Model
192
195
  Desmoothing](https://huggingface.co/papers/2210.15191) for more details.
193
- repetition_penalty (`float`, *optional*, defaults to 1.0):
196
+ repetition_penalty (`float`, *optional*):
194
197
  The parameter for repetition penalty. 1.0 means no penalty. See [this
195
198
  paper](https://huggingface.co/papers/1909.05858) for more details.
196
- encoder_repetition_penalty (`float`, *optional*, defaults to 1.0):
199
+ encoder_repetition_penalty (`float`, *optional*):
197
200
  The parameter for encoder_repetition_penalty. An exponential penalty on sequences that are not in the
198
201
  original input. 1.0 means no penalty.
199
- length_penalty (`float`, *optional*, defaults to 1.0):
202
+ length_penalty (`float`, *optional*):
200
203
  Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
201
204
  the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
202
205
  likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
203
206
  `length_penalty` < 0.0 encourages shorter sequences.
204
- no_repeat_ngram_size (`int`, *optional*, defaults to 0):
207
+ no_repeat_ngram_size (`int`, *optional*):
205
208
  If set to int > 0, all ngrams of that size can only occur once.
206
209
  bad_words_ids (`list[list[int]]`, *optional*):
207
210
  List of list of token ids that are not allowed to be generated. Check
208
211
  [`~generation.NoBadWordsLogitsProcessor`] for further documentation and examples.
209
- renormalize_logits (`bool`, *optional*, defaults to `False`):
212
+ renormalize_logits (`bool`):
210
213
  Whether to renormalize the logits after applying all the logits processors (including the custom
211
214
  ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the score logits
212
215
  are normalized but some logit processors break the normalization.
@@ -217,7 +220,7 @@ class GenerationConfig(PushToHubMixin):
217
220
  forced_eos_token_id (`int` or list[int]`, *optional*, defaults to `model.config.forced_eos_token_id`):
218
221
  The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a
219
222
  list to set multiple *end-of-sequence* tokens.
220
- remove_invalid_values (`bool`, *optional*, defaults to `model.config.remove_invalid_values`):
223
+ remove_invalid_values (`bool`):
221
224
  Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to crash.
222
225
  Note that using `remove_invalid_values` can slow down generation.
223
226
  exponential_decay_length_penalty (`tuple(int, float)`, *optional*):
@@ -234,7 +237,7 @@ class GenerationConfig(PushToHubMixin):
234
237
  Dictionary that maps a sequence of tokens to its bias term. Positive biases increase the odds of the
235
238
  sequence being selected, while negative biases do the opposite. Check
236
239
  [`~generation.SequenceBiasLogitsProcessor`] for further documentation and examples.
237
- token_healing (`bool`, *optional*, defaults to `False`):
240
+ token_healing (`bool`):
238
241
  Heal tail tokens of prompts by replacing them with their appropriate extensions.
239
242
  This enhances the quality of completions for prompts affected by greedy tokenization bias.
240
243
  guidance_scale (`float`, *optional*):
@@ -248,20 +251,20 @@ class GenerationConfig(PushToHubMixin):
248
251
 
249
252
  > Parameters that define the output variables of generate
250
253
 
251
- num_return_sequences (`int`, *optional*, defaults to 1):
254
+ num_return_sequences (`int`, *optional*):
252
255
  The number of independently computed returned sequences for each element in the batch.
253
- output_attentions (`bool`, *optional*, defaults to `False`):
256
+ output_attentions (`bool`):
254
257
  Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
255
258
  tensors for more details.
256
- output_hidden_states (`bool`, *optional*, defaults to `False`):
259
+ output_hidden_states (`bool`):
257
260
  Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
258
261
  more details.
259
- output_scores (`bool`, *optional*, defaults to `False`):
262
+ output_scores (`bool`):
260
263
  Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
261
- output_logits (`bool`, *optional*):
264
+ output_logits (`bool`):
262
265
  Whether or not to return the unprocessed prediction logit scores. See `logits` under returned tensors for
263
266
  more details.
264
- return_dict_in_generate (`bool`, *optional*, defaults to `False`):
267
+ return_dict_in_generate (`bool`):
265
268
  Whether or not to return a [`~utils.ModelOutput`], as opposed to returning exclusively the generated
266
269
  sequence. This flag must be set to `True` to return the generation cache (when `use_cache` is `True`)
267
270
  or optional outputs (see flags starting with `output_`)
@@ -277,7 +280,7 @@ class GenerationConfig(PushToHubMixin):
277
280
 
278
281
  > Generation parameters exclusive to encoder-decoder models
279
282
 
280
- encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0):
283
+ encoder_no_repeat_ngram_size (`int`, *optional*):
281
284
  If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the
282
285
  `decoder_input_ids`.
283
286
  decoder_start_token_id (`int` or `list[int]`, *optional*):
@@ -286,20 +289,20 @@ class GenerationConfig(PushToHubMixin):
286
289
  (e.g. multilingual models with different target languages in one batch)
287
290
 
288
291
  > Generation parameters exclusive to assistant generation
289
- is_assistant (`bool`, *optional*, defaults to `False`):
292
+ is_assistant (`bool`):
290
293
  Whether the model is an assistant (draft) model.
291
- num_assistant_tokens (`int`, *optional*, defaults to 20):
294
+ num_assistant_tokens (`int`, *optional*):
292
295
  Defines the number of _speculative tokens_ that shall be generated by the assistant model before being
293
296
  checked by the target model at each iteration. Higher values for `num_assistant_tokens` make the generation
294
297
  more _speculative_ : If the assistant model is performant larger speed-ups can be reached, if the assistant
295
298
  model requires lots of corrections, lower speed-ups are reached.
296
- num_assistant_tokens_schedule (`str`, *optional*, defaults to `"constant"`):
299
+ num_assistant_tokens_schedule (`str`, *optional*):
297
300
  Defines the schedule at which max assistant tokens shall be changed during inference.
298
301
  - `"heuristic"`: When all speculative tokens are correct, increase `num_assistant_tokens` by 2 else
299
302
  reduce by 1. `num_assistant_tokens` value is persistent over multiple generation calls with the same assistant model.
300
303
  - `"heuristic_transient"`: Same as `"heuristic"` but `num_assistant_tokens` is reset to its initial value after each generation call.
301
304
  - `"constant"`: `num_assistant_tokens` stays unchanged during generation
302
- assistant_confidence_threshold (`float`, *optional*, defaults to 0.4):
305
+ assistant_confidence_threshold (`float`, *optional*):
303
306
  The confidence threshold for the assistant model. If the assistant model's confidence in its prediction for the current token is lower
304
307
  than this threshold, the assistant model stops the current token generation iteration, even if the number of _speculative tokens_
305
308
  (defined by `num_assistant_tokens`) is not yet reached. The assistant's confidence threshold is adjusted throughout the speculative iterations to reduce the number of unnecessary draft and target forward passes, biased towards avoiding false negatives.
@@ -313,11 +316,11 @@ class GenerationConfig(PushToHubMixin):
313
316
  assistant_early_exit(`int`, *optional*):
314
317
  If set to a positive integer, early exit of the model will be used as an assistant. Can only be used with
315
318
  models that support early exit (i.e. models where logits from intermediate layers can be interpreted by the LM head).
316
- assistant_lookbehind(`int`, *optional*, defaults to 10):
319
+ assistant_lookbehind(`int`, *optional*):
317
320
  If set to a positive integer, the re-encodeing process will additionally consider the last `assistant_lookbehind` assistant tokens
318
321
  to correctly align tokens. Can only be used with different tokenizers in speculative decoding.
319
322
  See this [blog](https://huggingface.co/blog/universal_assisted_generation) for more details.
320
- target_lookbehind(`int`, *optional*, defaults to 10):
323
+ target_lookbehind(`int`, *optional*):
321
324
  If set to a positive integer, the re-encodeing process will additionally consider the last `target_lookbehind` target tokens
322
325
  to correctly align tokens. Can only be used with different tokenizers in speculative decoding.
323
326
  See this [blog](https://huggingface.co/blog/universal_assisted_generation) for more details.
@@ -327,7 +330,7 @@ class GenerationConfig(PushToHubMixin):
327
330
  compile_config (CompileConfig, *optional*):
328
331
  If using a compilable cache, this controls how `generate` will `compile` the forward pass for faster
329
332
  inference.
330
- disable_compile (`bool`, *optional*):
333
+ disable_compile (`bool`):
331
334
  Whether to disable the automatic compilation of the forward pass. Automatic compilation happens when
332
335
  specific criteria are met, including using a compilable cache. Please open an issue if you find the
333
336
  need to use this flag.
@@ -337,65 +340,59 @@ class GenerationConfig(PushToHubMixin):
337
340
 
338
341
  def __init__(self, **kwargs):
339
342
  # Parameters that control the length of the output
340
- self.max_length = kwargs.pop("max_length", 20)
343
+ self.max_length = kwargs.pop("max_length", None)
341
344
  self.max_new_tokens = kwargs.pop("max_new_tokens", None)
342
- self.min_length = kwargs.pop("min_length", 0)
345
+ self.min_length = kwargs.pop("min_length", None)
343
346
  self.min_new_tokens = kwargs.pop("min_new_tokens", None)
344
- self.early_stopping = kwargs.pop("early_stopping", False)
347
+ self.early_stopping = kwargs.pop("early_stopping", None)
345
348
  self.max_time = kwargs.pop("max_time", None)
346
349
  self.stop_strings = kwargs.pop("stop_strings", None)
347
350
 
348
351
  # Parameters that control the generation strategy used
349
- self.do_sample = kwargs.pop("do_sample", False)
350
- self.num_beams = kwargs.pop("num_beams", 1)
352
+ self.do_sample = kwargs.pop("do_sample", None)
353
+ self.num_beams = kwargs.pop("num_beams", None)
351
354
 
352
355
  # Parameters that control the cache
353
- self.use_cache = kwargs.pop("use_cache", True)
356
+ self.use_cache = kwargs.pop("use_cache", None)
354
357
  self.cache_implementation = kwargs.pop("cache_implementation", None)
355
358
  self.cache_config = kwargs.pop("cache_config", None)
356
359
 
357
- self.prefill_chunk_size = kwargs.pop("prefill_chunk_size", None)
358
-
359
360
  # Parameters for manipulation of the model output logits
360
- self.temperature = kwargs.pop("temperature", 1.0)
361
- self.top_k = kwargs.pop("top_k", 50)
362
- self.top_p = kwargs.pop("top_p", 1.0)
361
+ self.temperature = kwargs.pop("temperature", None)
362
+ self.top_k = kwargs.pop("top_k", None)
363
+ self.top_p = kwargs.pop("top_p", None)
363
364
  self.min_p = kwargs.pop("min_p", None)
364
365
  self.top_h = kwargs.pop("top_h", None)
365
- self.typical_p = kwargs.pop("typical_p", 1.0)
366
- self.epsilon_cutoff = kwargs.pop("epsilon_cutoff", 0.0)
367
- self.eta_cutoff = kwargs.pop("eta_cutoff", 0.0)
368
- self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0)
369
- self.encoder_repetition_penalty = kwargs.pop("encoder_repetition_penalty", 1.0)
370
- self.length_penalty = kwargs.pop("length_penalty", 1.0)
371
- self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0)
366
+ self.typical_p = kwargs.pop("typical_p", None)
367
+ self.epsilon_cutoff = kwargs.pop("epsilon_cutoff", None)
368
+ self.eta_cutoff = kwargs.pop("eta_cutoff", None)
369
+ self.repetition_penalty = kwargs.pop("repetition_penalty", None)
370
+ self.encoder_repetition_penalty = kwargs.pop("encoder_repetition_penalty", None)
371
+ self.length_penalty = kwargs.pop("length_penalty", None)
372
+ self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", None)
372
373
  self.bad_words_ids = kwargs.pop("bad_words_ids", None)
373
- self.renormalize_logits = kwargs.pop("renormalize_logits", False)
374
+ self.renormalize_logits = kwargs.pop("renormalize_logits", None)
374
375
  self.forced_bos_token_id = kwargs.pop("forced_bos_token_id", None)
375
376
  self.forced_eos_token_id = kwargs.pop("forced_eos_token_id", None)
376
- self.remove_invalid_values = kwargs.pop("remove_invalid_values", False)
377
+ self.remove_invalid_values = kwargs.pop("remove_invalid_values", None)
377
378
  self.exponential_decay_length_penalty = kwargs.pop("exponential_decay_length_penalty", None)
378
379
  self.suppress_tokens = kwargs.pop("suppress_tokens", None)
379
380
  self.begin_suppress_tokens = kwargs.pop("begin_suppress_tokens", None)
380
381
  self.sequence_bias = kwargs.pop("sequence_bias", None)
381
- self.token_healing = kwargs.pop("token_healing", False)
382
+ self.token_healing = kwargs.pop("token_healing", None)
382
383
  self.guidance_scale = kwargs.pop("guidance_scale", None)
383
384
 
384
- watermarking_config = kwargs.pop("watermarking_config", None)
385
- if watermarking_config is None:
386
- self.watermarking_config = None
387
- elif isinstance(watermarking_config, BaseWatermarkingConfig):
388
- self.watermarking_config = watermarking_config
389
- else:
390
- self.watermarking_config = WatermarkingConfig.from_dict(watermarking_config)
385
+ self.watermarking_config = kwargs.pop("watermarking_config", None)
386
+ if isinstance(self.watermarking_config, dict):
387
+ self.watermarking_config = WatermarkingConfig.from_dict(self.watermarking_config)
391
388
 
392
389
  # Parameters that define the output variables of `generate`
393
- self.num_return_sequences = kwargs.pop("num_return_sequences", 1)
394
- self.output_attentions = kwargs.pop("output_attentions", False)
395
- self.output_hidden_states = kwargs.pop("output_hidden_states", False)
396
- self.output_scores = kwargs.pop("output_scores", False)
390
+ self.num_return_sequences = kwargs.pop("num_return_sequences", None)
391
+ self.output_attentions = kwargs.pop("output_attentions", None)
392
+ self.output_hidden_states = kwargs.pop("output_hidden_states", None)
393
+ self.output_scores = kwargs.pop("output_scores", None)
397
394
  self.output_logits = kwargs.pop("output_logits", None)
398
- self.return_dict_in_generate = kwargs.pop("return_dict_in_generate", False)
395
+ self.return_dict_in_generate = kwargs.pop("return_dict_in_generate", None)
399
396
 
400
397
  # Special tokens that can be used at generation time
401
398
  self.pad_token_id = kwargs.pop("pad_token_id", None)
@@ -403,57 +400,57 @@ class GenerationConfig(PushToHubMixin):
403
400
  self.eos_token_id = kwargs.pop("eos_token_id", None)
404
401
 
405
402
  # Generation parameters exclusive to encoder-decoder models
406
- self.encoder_no_repeat_ngram_size = kwargs.pop("encoder_no_repeat_ngram_size", 0)
403
+ self.encoder_no_repeat_ngram_size = kwargs.pop("encoder_no_repeat_ngram_size", None)
407
404
  self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None)
408
405
 
409
406
  # Assistant generation
410
- self.is_assistant = False
411
- self.num_assistant_tokens = kwargs.pop("num_assistant_tokens", 20)
412
- self.num_assistant_tokens_schedule = kwargs.pop("num_assistant_tokens_schedule", "constant")
413
- self.assistant_confidence_threshold = kwargs.pop("assistant_confidence_threshold", 0.4)
407
+ self.is_assistant = kwargs.pop("is_assistant", None)
408
+ self.num_assistant_tokens = kwargs.pop("num_assistant_tokens", None)
409
+ self.num_assistant_tokens_schedule = kwargs.pop("num_assistant_tokens_schedule", None)
410
+ self.assistant_confidence_threshold = kwargs.pop("assistant_confidence_threshold", None)
414
411
  self.prompt_lookup_num_tokens = kwargs.pop("prompt_lookup_num_tokens", None)
415
412
  self.max_matching_ngram_size = kwargs.pop("max_matching_ngram_size", None)
416
413
  self.assistant_early_exit = kwargs.pop("assistant_early_exit", None)
417
- ## assistant generation for different tokenizers, the windows size for assistant/target model
418
- self.assistant_lookbehind = kwargs.pop("assistant_lookbehind", 10)
419
- self.target_lookbehind = kwargs.pop("target_lookbehind", 10)
414
+ self.assistant_lookbehind = kwargs.pop("assistant_lookbehind", None)
415
+ self.target_lookbehind = kwargs.pop("target_lookbehind", None)
420
416
 
421
417
  # Performance
422
418
  self.compile_config = kwargs.pop("compile_config", None)
423
- self.disable_compile = kwargs.pop("disable_compile", False)
419
+ self.disable_compile = kwargs.pop("disable_compile", None)
424
420
 
425
- # Deprecated (moved to the Hub). TODO joao, manuel: remove in v4.62.0
421
+ # Deprecated (moved to the Hub). TODO remove for v5
426
422
  self.low_memory = kwargs.pop("low_memory", None)
427
423
  self.penalty_alpha = kwargs.pop("penalty_alpha", None)
428
424
  self.dola_layers = kwargs.pop("dola_layers", None)
429
- self.diversity_penalty = kwargs.pop("diversity_penalty", 0.0)
430
- self.num_beam_groups = kwargs.pop("num_beam_groups", 1)
425
+ self.diversity_penalty = kwargs.pop("diversity_penalty", None)
426
+ self.num_beam_groups = kwargs.pop("num_beam_groups", None)
431
427
  self.constraints = kwargs.pop("constraints", None)
432
428
  self.force_words_ids = kwargs.pop("force_words_ids", None)
433
429
 
434
- # The remaining attributes do not parametrize `.generate()`, but are informative and/or used by the hub
435
- # interface.
436
- self._from_model_config = kwargs.pop("_from_model_config", False)
437
- self._commit_hash = kwargs.pop("_commit_hash", None)
438
- self.transformers_version = kwargs.pop("transformers_version", __version__)
430
+ self.prefill_chunk_size = kwargs.pop("prefill_chunk_size", None)
439
431
 
440
- # Ensure backward compatibility for models that use `forced_bos_token_id` within their config
441
- if self._from_model_config and kwargs.get("force_bos_token_to_be_generated", False):
442
- self.forced_bos_token_id = self.bos_token_id
443
- logger.warning_once(
444
- f"Please make sure the generation config includes `forced_bos_token_id={self.bos_token_id}`. "
445
- )
432
+ # Common attributes
433
+ self._commit_hash = kwargs.pop("_commit_hash", None)
434
+ self._from_model_config = kwargs.pop("_from_model_config", None)
435
+ self.transformers_version = kwargs.pop("transformers_version", None)
446
436
 
447
437
  # Additional attributes without default values
448
438
  if not self._from_model_config:
449
- # we don't want to copy values from the model config if we're initializing a `GenerationConfig` from a
450
- # model's default configuration file
439
+ # we don't want to copy values from the model config if we're initializing
440
+ # a `GenerationConfig` from a model's default configuration file
451
441
  for key, value in kwargs.items():
452
442
  try:
453
443
  setattr(self, key, value)
454
444
  except AttributeError as err:
455
445
  logger.error(f"Can't set {key} with value {value} for {self}")
456
446
  raise err
447
+ else:
448
+ # Ensure backward compatibility for models that use `forced_bos_token_id` within their config
449
+ if kwargs.get("force_bos_token_to_be_generated", False):
450
+ self.forced_bos_token_id = self.bos_token_id
451
+ logger.warning_once(
452
+ f"Please make sure the generation config includes `forced_bos_token_id={self.bos_token_id}`. "
453
+ )
457
454
 
458
455
  # Validate the values of the attributes
459
456
  self.validate()
@@ -488,8 +485,8 @@ class GenerationConfig(PushToHubMixin):
488
485
  # property and part of the `__repr__`
489
486
  if self.constraints is not None or self.force_words_ids is not None:
490
487
  generation_mode = GenerationMode.CONSTRAINED_BEAM_SEARCH
491
- elif self.num_beams == 1:
492
- if self.do_sample is False:
488
+ elif self.num_beams is None or self.num_beams == 1:
489
+ if self.do_sample is not True:
493
490
  if (
494
491
  self.top_k is not None
495
492
  and self.top_k > 1
@@ -502,7 +499,7 @@ class GenerationConfig(PushToHubMixin):
502
499
  else:
503
500
  generation_mode = GenerationMode.SAMPLE
504
501
  else:
505
- if self.num_beam_groups > 1:
502
+ if self.num_beam_groups is not None and self.num_beam_groups > 1:
506
503
  generation_mode = GenerationMode.GROUP_BEAM_SEARCH
507
504
  elif self.do_sample is True:
508
505
  generation_mode = GenerationMode.BEAM_SAMPLE
@@ -537,6 +534,46 @@ class GenerationConfig(PushToHubMixin):
537
534
  )
538
535
  return generation_mode
539
536
 
537
+ @staticmethod
538
+ def _get_default_generation_params() -> dict[str, Any]:
539
+ return {
540
+ "max_length": 20,
541
+ "min_length": 0,
542
+ "do_sample": False,
543
+ "use_cache": True,
544
+ "early_stopping": False,
545
+ "num_beams": 1,
546
+ "temperature": 1.0,
547
+ "top_k": 50,
548
+ "top_p": 1.0,
549
+ "typical_p": 1.0,
550
+ "repetition_penalty": 1.0,
551
+ "length_penalty": 1.0,
552
+ "no_repeat_ngram_size": 0,
553
+ "encoder_no_repeat_ngram_size": 0,
554
+ "bad_words_ids": None,
555
+ "num_return_sequences": 1,
556
+ "output_scores": False,
557
+ "return_dict_in_generate": False,
558
+ "forced_bos_token_id": None,
559
+ "forced_eos_token_id": None,
560
+ "remove_invalid_values": False,
561
+ "exponential_decay_length_penalty": None,
562
+ "suppress_tokens": None,
563
+ "begin_suppress_tokens": None,
564
+ "epsilon_cutoff": 0.0,
565
+ "eta_cutoff": 0.0,
566
+ "encoder_repetition_penalty": 1.0,
567
+ "num_assistant_tokens": 20,
568
+ "num_assistant_tokens_schedule": "constant",
569
+ "assistant_confidence_threshold": 0.4,
570
+ "assistant_lookbehind": 10,
571
+ "target_lookbehind": 10,
572
+ # Deprecated arguments (moved to the Hub). TODO joao, manuel: remove in v4.62.0
573
+ "num_beam_groups": 1,
574
+ "diversity_penalty": 0.0,
575
+ }
576
+
540
577
  def validate(self, strict=False):
541
578
  """
542
579
  Validates the values of the attributes of the [`GenerationConfig`] instance. Raises exceptions in the presence
@@ -552,7 +589,7 @@ class GenerationConfig(PushToHubMixin):
552
589
 
553
590
  # 1. Validation of individual attributes
554
591
  # 1.1. Decoding attributes
555
- if self.early_stopping not in {True, False, "never"}:
592
+ if self.early_stopping not in {None, True, False, "never"}:
556
593
  raise ValueError(f"`early_stopping` must be a boolean or 'never', but is {self.early_stopping}.")
557
594
  if self.max_new_tokens is not None and self.max_new_tokens <= 0:
558
595
  raise ValueError(f"`max_new_tokens` must be greater than 0, but is {self.max_new_tokens}.")
@@ -583,9 +620,12 @@ class GenerationConfig(PushToHubMixin):
583
620
 
584
621
  # 2. Validation of attribute combinations
585
622
  # 2.1. detect sampling-only parameterization when not in sampling mode
586
- if self.do_sample is False:
623
+
624
+ # Note that we check `is not True` in purpose. Boolean fields can also be `None` so we
625
+ # have to be explicit. Value of `None` is same as having `False`, i.e. the default value
626
+ if self.do_sample is not True:
587
627
  greedy_wrong_parameter_msg = (
588
- "`do_sample` is set to `False`. However, `{flag_name}` is set to `{flag_value}` -- this flag is only "
628
+ "`do_sample` is set not to set `True`. However, `{flag_name}` is set to `{flag_value}` -- this flag is only "
589
629
  "used in sample-based generation modes. You should set `do_sample=True` or unset `{flag_name}`."
590
630
  )
591
631
  if self.temperature is not None and self.temperature != 1.0:
@@ -614,29 +654,33 @@ class GenerationConfig(PushToHubMixin):
614
654
  )
615
655
 
616
656
  # 2.2. detect beam-only parameterization when not in beam mode
617
- if self.num_beams == 1:
657
+ if self.num_beams is None or self.num_beams == 1:
618
658
  single_beam_wrong_parameter_msg = (
619
- "`num_beams` is set to 1. However, `{flag_name}` is set to `{flag_value}` -- this flag is only used "
659
+ "`num_beams` is set to {num_beams}. However, `{flag_name}` is set to `{flag_value}` -- this flag is only used "
620
660
  "in beam-based generation modes. You should set `num_beams>1` or unset `{flag_name}`."
621
661
  )
622
- if self.early_stopping is not False:
662
+ if self.early_stopping is not None and self.early_stopping is not False:
623
663
  minor_issues["early_stopping"] = single_beam_wrong_parameter_msg.format(
624
- flag_name="early_stopping", flag_value=self.early_stopping
664
+ num_beams=self.num_beams, flag_name="early_stopping", flag_value=self.early_stopping
625
665
  )
626
666
  if self.length_penalty is not None and self.length_penalty != 1.0:
627
667
  minor_issues["length_penalty"] = single_beam_wrong_parameter_msg.format(
628
- flag_name="length_penalty", flag_value=self.length_penalty
668
+ num_beams=self.num_beams, flag_name="length_penalty", flag_value=self.length_penalty
629
669
  )
630
670
 
631
671
  # 2.4. check `num_return_sequences`
632
- if self.num_return_sequences != 1:
633
- if self.num_beams == 1:
634
- if self.do_sample is False:
672
+ if self.num_return_sequences is not None and self.num_return_sequences > 1:
673
+ if self.num_beams is None or self.num_beams == 1:
674
+ if not self.do_sample:
635
675
  raise ValueError(
636
- "Greedy methods without beam search do not support `num_return_sequences` different than 1 "
637
- f"(got {self.num_return_sequences})."
676
+ "Greedy methods (do_sample != True) without beam search do not support "
677
+ f"`num_return_sequences` different than 1 (got {self.num_return_sequences})."
638
678
  )
639
- elif self.num_return_sequences > self.num_beams:
679
+ elif (
680
+ self.num_beams is not None
681
+ and self.num_return_sequences is not None
682
+ and self.num_return_sequences > self.num_beams
683
+ ):
640
684
  raise ValueError(
641
685
  f"`num_return_sequences` ({self.num_return_sequences}) has to be smaller or equal to `num_beams` "
642
686
  f"({self.num_beams})."
@@ -648,8 +692,8 @@ class GenerationConfig(PushToHubMixin):
648
692
  # passed to `generate` directly to hot-fix cache issues, let's raise a warning instead of an error
649
693
  # (otherwise a user might need to overwrite several parameters).
650
694
  no_cache_warning = (
651
- "You have set `use_cache` to `False`, but {cache_arg} is set to {cache_arg_value}. {cache_arg} will "
652
- "have no effect."
695
+ "You have not set `use_cache` to `True`, but {cache_arg} is set to {cache_arg_value}."
696
+ "{cache_arg} will have no effect."
653
697
  )
654
698
  for arg_name in ("cache_implementation", "cache_config"):
655
699
  if getattr(self, arg_name) is not None:
@@ -676,7 +720,6 @@ class GenerationConfig(PushToHubMixin):
676
720
  "streamer",
677
721
  "negative_prompt_ids",
678
722
  "negative_prompt_attention_mask",
679
- "use_model_defaults",
680
723
  )
681
724
  for arg in generate_arguments:
682
725
  if hasattr(self, arg):
@@ -1101,7 +1144,7 @@ class GenerationConfig(PushToHubMixin):
1101
1144
  writer.write(self.to_json_string(use_diff=use_diff, keys_to_pop=keys_to_pop))
1102
1145
 
1103
1146
  @classmethod
1104
- def from_model_config(cls, model_config: PreTrainedConfig | dict) -> "GenerationConfig":
1147
+ def from_model_config(cls, model_config: Union["PreTrainedConfig", dict]) -> "GenerationConfig":
1105
1148
  """
1106
1149
  Instantiates a [`GenerationConfig`] from a [`PreTrainedConfig`]. This function is useful to convert legacy
1107
1150
  [`PreTrainedConfig`] objects, which may contain generation parameters, into a stand-alone [`GenerationConfig`].
@@ -1118,23 +1161,28 @@ class GenerationConfig(PushToHubMixin):
1118
1161
 
1119
1162
  # Removes all `None` from the model config dict -- this lets the generation config defaults to take hold
1120
1163
  config_dict = {key: value for key, value in config_dict.items() if value is not None}
1121
-
1122
1164
  generation_config = cls.from_dict(config_dict, return_unused_kwargs=False, _from_model_config=True)
1123
1165
 
1124
1166
  # Special case: some models have generation attributes set in the decoder. Use them if still unset in the
1125
1167
  # generation config (which in turn is defined from the outer attributes of model config).
1126
- if not isinstance(model_config, dict):
1127
- decoder_config = model_config.get_text_config(decoder=True)
1128
- if decoder_config is not model_config:
1129
- default_generation_config = GenerationConfig()
1130
- decoder_config_dict = decoder_config.to_dict()
1131
- for attr in generation_config.to_dict():
1132
- is_unset = getattr(generation_config, attr) == getattr(default_generation_config, attr)
1133
- if attr in decoder_config_dict and is_unset:
1134
- setattr(generation_config, attr, decoder_config_dict[attr])
1168
+ if isinstance(model_config, dict):
1169
+ decoder_possible_text_config_names = ("decoder", "generator", "text_config")
1170
+ for text_config_name in decoder_possible_text_config_names:
1171
+ if text_config := model_config.get(text_config_name):
1172
+ model_config = text_config
1173
+ break
1174
+ else:
1175
+ model_config = model_config.get_text_config(decoder=True)
1176
+ model_config = model_config.to_dict()
1177
+
1178
+ default_generation_config = GenerationConfig()
1179
+ for attr in generation_config.to_dict():
1180
+ is_unset = getattr(generation_config, attr) == getattr(default_generation_config, attr)
1181
+ if attr in model_config and is_unset:
1182
+ setattr(generation_config, attr, model_config[attr])
1135
1183
 
1136
1184
  # If any `output_...` flag is set to `True`, we ensure `return_dict_in_generate` is set to `True`.
1137
- if generation_config.return_dict_in_generate is False:
1185
+ if not generation_config.return_dict_in_generate:
1138
1186
  if any(
1139
1187
  getattr(generation_config, extra_output_flag, False)
1140
1188
  for extra_output_flag in generation_config.extra_output_flags
@@ -1145,12 +1193,16 @@ class GenerationConfig(PushToHubMixin):
1145
1193
  generation_config._original_object_hash = hash(generation_config)
1146
1194
  return generation_config
1147
1195
 
1148
- def update(self, **kwargs):
1196
+ def update(self, defaults_only=False, allow_custom_entries=False, **kwargs):
1149
1197
  """
1150
1198
  Updates attributes of this class instance with attributes from `kwargs` if they match existing attributes,
1151
1199
  returning all the unused kwargs.
1152
1200
 
1153
1201
  Args:
1202
+ defaults_only (`bool`, *optional*, defaults to `False`):
1203
+ Whether to update all keys in config with `kwargs` or only those that are set to `None` (i.e. default value).
1204
+ allow_custom_entries (`bool`, *optional*, defaults to `False`):
1205
+ Whether to allow updating custom entries into the config with `kwargs` if not present in the current config.
1154
1206
  kwargs (`dict[str, Any]`):
1155
1207
  Dictionary of attributes to tentatively update this class.
1156
1208
 
@@ -1159,9 +1211,13 @@ class GenerationConfig(PushToHubMixin):
1159
1211
  """
1160
1212
  to_remove = []
1161
1213
  for key, value in kwargs.items():
1162
- if hasattr(self, key):
1214
+ if allow_custom_entries and not hasattr(self, key):
1163
1215
  setattr(self, key, value)
1164
1216
  to_remove.append(key)
1217
+ elif hasattr(self, key):
1218
+ if not defaults_only or getattr(self, key) is None:
1219
+ setattr(self, key, value)
1220
+ to_remove.append(key)
1165
1221
 
1166
1222
  # Confirm that the updated instance is still valid
1167
1223
  self.validate()
@@ -1221,8 +1277,7 @@ class BaseWatermarkingConfig(ABC):
1221
1277
  return output
1222
1278
 
1223
1279
  def __iter__(self):
1224
- for attr, value in copy.deepcopy(self.__dict__).items():
1225
- yield attr, value
1280
+ yield from copy.deepcopy(self.__dict__).items()
1226
1281
 
1227
1282
  def __repr__(self):
1228
1283
  return f"{self.__class__.__name__} {self.to_json_string()}"