transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1021)
  1. transformers/__init__.py +4 -11
  2. transformers/activations.py +2 -2
  3. transformers/backbone_utils.py +326 -0
  4. transformers/cache_utils.py +11 -2
  5. transformers/cli/serve.py +11 -8
  6. transformers/configuration_utils.py +1 -69
  7. transformers/conversion_mapping.py +146 -26
  8. transformers/convert_slow_tokenizer.py +6 -4
  9. transformers/core_model_loading.py +207 -118
  10. transformers/dependency_versions_check.py +0 -1
  11. transformers/dependency_versions_table.py +7 -8
  12. transformers/file_utils.py +0 -2
  13. transformers/generation/candidate_generator.py +1 -2
  14. transformers/generation/continuous_batching/cache.py +40 -38
  15. transformers/generation/continuous_batching/cache_manager.py +3 -16
  16. transformers/generation/continuous_batching/continuous_api.py +94 -406
  17. transformers/generation/continuous_batching/input_ouputs.py +464 -0
  18. transformers/generation/continuous_batching/requests.py +54 -17
  19. transformers/generation/continuous_batching/scheduler.py +77 -95
  20. transformers/generation/logits_process.py +10 -5
  21. transformers/generation/stopping_criteria.py +1 -2
  22. transformers/generation/utils.py +75 -95
  23. transformers/image_processing_utils.py +0 -3
  24. transformers/image_processing_utils_fast.py +17 -18
  25. transformers/image_transforms.py +44 -13
  26. transformers/image_utils.py +0 -5
  27. transformers/initialization.py +57 -0
  28. transformers/integrations/__init__.py +10 -24
  29. transformers/integrations/accelerate.py +47 -11
  30. transformers/integrations/deepspeed.py +145 -3
  31. transformers/integrations/executorch.py +2 -6
  32. transformers/integrations/finegrained_fp8.py +142 -7
  33. transformers/integrations/flash_attention.py +2 -7
  34. transformers/integrations/hub_kernels.py +18 -7
  35. transformers/integrations/moe.py +226 -106
  36. transformers/integrations/mxfp4.py +47 -34
  37. transformers/integrations/peft.py +488 -176
  38. transformers/integrations/tensor_parallel.py +641 -581
  39. transformers/masking_utils.py +153 -9
  40. transformers/modeling_flash_attention_utils.py +1 -2
  41. transformers/modeling_utils.py +359 -358
  42. transformers/models/__init__.py +6 -0
  43. transformers/models/afmoe/configuration_afmoe.py +14 -4
  44. transformers/models/afmoe/modeling_afmoe.py +8 -8
  45. transformers/models/afmoe/modular_afmoe.py +7 -7
  46. transformers/models/aimv2/configuration_aimv2.py +2 -7
  47. transformers/models/aimv2/modeling_aimv2.py +26 -24
  48. transformers/models/aimv2/modular_aimv2.py +8 -12
  49. transformers/models/albert/configuration_albert.py +8 -1
  50. transformers/models/albert/modeling_albert.py +3 -3
  51. transformers/models/align/configuration_align.py +8 -5
  52. transformers/models/align/modeling_align.py +22 -24
  53. transformers/models/altclip/configuration_altclip.py +4 -6
  54. transformers/models/altclip/modeling_altclip.py +30 -26
  55. transformers/models/apertus/configuration_apertus.py +5 -7
  56. transformers/models/apertus/modeling_apertus.py +4 -4
  57. transformers/models/apertus/modular_apertus.py +8 -10
  58. transformers/models/arcee/configuration_arcee.py +5 -7
  59. transformers/models/arcee/modeling_arcee.py +4 -4
  60. transformers/models/aria/configuration_aria.py +11 -21
  61. transformers/models/aria/modeling_aria.py +39 -36
  62. transformers/models/aria/modular_aria.py +33 -39
  63. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
  64. transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
  65. transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
  66. transformers/models/auto/auto_factory.py +8 -6
  67. transformers/models/auto/configuration_auto.py +22 -0
  68. transformers/models/auto/image_processing_auto.py +17 -13
  69. transformers/models/auto/modeling_auto.py +15 -0
  70. transformers/models/auto/processing_auto.py +9 -18
  71. transformers/models/auto/tokenization_auto.py +17 -15
  72. transformers/models/autoformer/modeling_autoformer.py +2 -1
  73. transformers/models/aya_vision/configuration_aya_vision.py +4 -0
  74. transformers/models/aya_vision/modeling_aya_vision.py +29 -62
  75. transformers/models/aya_vision/modular_aya_vision.py +20 -45
  76. transformers/models/bamba/configuration_bamba.py +17 -7
  77. transformers/models/bamba/modeling_bamba.py +23 -55
  78. transformers/models/bamba/modular_bamba.py +19 -54
  79. transformers/models/bark/configuration_bark.py +2 -1
  80. transformers/models/bark/modeling_bark.py +24 -10
  81. transformers/models/bart/configuration_bart.py +9 -4
  82. transformers/models/bart/modeling_bart.py +9 -12
  83. transformers/models/beit/configuration_beit.py +2 -4
  84. transformers/models/beit/image_processing_beit_fast.py +3 -3
  85. transformers/models/beit/modeling_beit.py +14 -9
  86. transformers/models/bert/configuration_bert.py +12 -1
  87. transformers/models/bert/modeling_bert.py +6 -30
  88. transformers/models/bert_generation/configuration_bert_generation.py +17 -1
  89. transformers/models/bert_generation/modeling_bert_generation.py +6 -6
  90. transformers/models/big_bird/configuration_big_bird.py +12 -8
  91. transformers/models/big_bird/modeling_big_bird.py +0 -15
  92. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
  93. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
  94. transformers/models/biogpt/configuration_biogpt.py +8 -1
  95. transformers/models/biogpt/modeling_biogpt.py +4 -8
  96. transformers/models/biogpt/modular_biogpt.py +1 -5
  97. transformers/models/bit/configuration_bit.py +2 -4
  98. transformers/models/bit/modeling_bit.py +6 -5
  99. transformers/models/bitnet/configuration_bitnet.py +5 -7
  100. transformers/models/bitnet/modeling_bitnet.py +3 -4
  101. transformers/models/bitnet/modular_bitnet.py +3 -4
  102. transformers/models/blenderbot/configuration_blenderbot.py +8 -4
  103. transformers/models/blenderbot/modeling_blenderbot.py +4 -4
  104. transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
  105. transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
  106. transformers/models/blip/configuration_blip.py +9 -9
  107. transformers/models/blip/modeling_blip.py +55 -37
  108. transformers/models/blip_2/configuration_blip_2.py +2 -1
  109. transformers/models/blip_2/modeling_blip_2.py +81 -56
  110. transformers/models/bloom/configuration_bloom.py +5 -1
  111. transformers/models/bloom/modeling_bloom.py +2 -1
  112. transformers/models/blt/configuration_blt.py +23 -12
  113. transformers/models/blt/modeling_blt.py +20 -14
  114. transformers/models/blt/modular_blt.py +70 -10
  115. transformers/models/bridgetower/configuration_bridgetower.py +7 -1
  116. transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
  117. transformers/models/bridgetower/modeling_bridgetower.py +29 -15
  118. transformers/models/bros/configuration_bros.py +24 -17
  119. transformers/models/camembert/configuration_camembert.py +8 -1
  120. transformers/models/camembert/modeling_camembert.py +6 -6
  121. transformers/models/canine/configuration_canine.py +4 -1
  122. transformers/models/chameleon/configuration_chameleon.py +5 -7
  123. transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
  124. transformers/models/chameleon/modeling_chameleon.py +82 -36
  125. transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
  126. transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
  127. transformers/models/clap/configuration_clap.py +4 -8
  128. transformers/models/clap/modeling_clap.py +21 -22
  129. transformers/models/clip/configuration_clip.py +4 -1
  130. transformers/models/clip/image_processing_clip_fast.py +9 -0
  131. transformers/models/clip/modeling_clip.py +25 -22
  132. transformers/models/clipseg/configuration_clipseg.py +4 -1
  133. transformers/models/clipseg/modeling_clipseg.py +27 -25
  134. transformers/models/clipseg/processing_clipseg.py +11 -3
  135. transformers/models/clvp/configuration_clvp.py +14 -2
  136. transformers/models/clvp/modeling_clvp.py +19 -30
  137. transformers/models/codegen/configuration_codegen.py +4 -3
  138. transformers/models/codegen/modeling_codegen.py +2 -1
  139. transformers/models/cohere/configuration_cohere.py +5 -7
  140. transformers/models/cohere/modeling_cohere.py +4 -4
  141. transformers/models/cohere/modular_cohere.py +3 -3
  142. transformers/models/cohere2/configuration_cohere2.py +6 -8
  143. transformers/models/cohere2/modeling_cohere2.py +4 -4
  144. transformers/models/cohere2/modular_cohere2.py +9 -11
  145. transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
  146. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
  147. transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
  148. transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
  149. transformers/models/colqwen2/modeling_colqwen2.py +7 -6
  150. transformers/models/colqwen2/modular_colqwen2.py +7 -6
  151. transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
  152. transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
  153. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
  154. transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
  155. transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
  156. transformers/models/convbert/configuration_convbert.py +11 -7
  157. transformers/models/convnext/configuration_convnext.py +2 -4
  158. transformers/models/convnext/image_processing_convnext_fast.py +2 -2
  159. transformers/models/convnext/modeling_convnext.py +7 -6
  160. transformers/models/convnextv2/configuration_convnextv2.py +2 -4
  161. transformers/models/convnextv2/modeling_convnextv2.py +7 -6
  162. transformers/models/cpmant/configuration_cpmant.py +4 -0
  163. transformers/models/csm/configuration_csm.py +9 -15
  164. transformers/models/csm/modeling_csm.py +3 -3
  165. transformers/models/ctrl/configuration_ctrl.py +16 -0
  166. transformers/models/ctrl/modeling_ctrl.py +13 -25
  167. transformers/models/cwm/configuration_cwm.py +5 -7
  168. transformers/models/cwm/modeling_cwm.py +4 -4
  169. transformers/models/d_fine/configuration_d_fine.py +10 -56
  170. transformers/models/d_fine/modeling_d_fine.py +728 -868
  171. transformers/models/d_fine/modular_d_fine.py +335 -412
  172. transformers/models/dab_detr/configuration_dab_detr.py +22 -48
  173. transformers/models/dab_detr/modeling_dab_detr.py +11 -7
  174. transformers/models/dac/modeling_dac.py +1 -1
  175. transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
  176. transformers/models/data2vec/configuration_data2vec_text.py +11 -2
  177. transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
  178. transformers/models/data2vec/modeling_data2vec_text.py +6 -6
  179. transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
  180. transformers/models/dbrx/configuration_dbrx.py +11 -3
  181. transformers/models/dbrx/modeling_dbrx.py +6 -6
  182. transformers/models/dbrx/modular_dbrx.py +6 -6
  183. transformers/models/deberta/configuration_deberta.py +6 -0
  184. transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
  185. transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
  186. transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
  187. transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
  188. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
  189. transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
  190. transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
  191. transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
  192. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
  193. transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
  194. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
  195. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
  196. transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
  197. transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
  198. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
  199. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
  200. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
  201. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
  202. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
  203. transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
  204. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
  205. transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
  206. transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
  207. transformers/models/deit/modeling_deit.py +11 -7
  208. transformers/models/depth_anything/configuration_depth_anything.py +12 -42
  209. transformers/models/depth_anything/modeling_depth_anything.py +5 -3
  210. transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
  211. transformers/models/depth_pro/modeling_depth_pro.py +8 -4
  212. transformers/models/detr/configuration_detr.py +18 -49
  213. transformers/models/detr/image_processing_detr_fast.py +11 -11
  214. transformers/models/detr/modeling_detr.py +695 -734
  215. transformers/models/dia/configuration_dia.py +4 -7
  216. transformers/models/dia/generation_dia.py +8 -17
  217. transformers/models/dia/modeling_dia.py +7 -7
  218. transformers/models/dia/modular_dia.py +4 -4
  219. transformers/models/diffllama/configuration_diffllama.py +5 -7
  220. transformers/models/diffllama/modeling_diffllama.py +3 -8
  221. transformers/models/diffllama/modular_diffllama.py +2 -7
  222. transformers/models/dinat/configuration_dinat.py +2 -4
  223. transformers/models/dinat/modeling_dinat.py +7 -6
  224. transformers/models/dinov2/configuration_dinov2.py +2 -4
  225. transformers/models/dinov2/modeling_dinov2.py +9 -8
  226. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
  227. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
  228. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
  229. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
  230. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
  231. transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
  232. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
  233. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
  234. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
  235. transformers/models/distilbert/configuration_distilbert.py +8 -1
  236. transformers/models/distilbert/modeling_distilbert.py +3 -3
  237. transformers/models/doge/configuration_doge.py +17 -7
  238. transformers/models/doge/modeling_doge.py +4 -4
  239. transformers/models/doge/modular_doge.py +20 -10
  240. transformers/models/donut/image_processing_donut_fast.py +4 -4
  241. transformers/models/dots1/configuration_dots1.py +16 -7
  242. transformers/models/dots1/modeling_dots1.py +4 -4
  243. transformers/models/dpr/configuration_dpr.py +19 -1
  244. transformers/models/dpt/configuration_dpt.py +23 -65
  245. transformers/models/dpt/image_processing_dpt_fast.py +5 -5
  246. transformers/models/dpt/modeling_dpt.py +19 -15
  247. transformers/models/dpt/modular_dpt.py +4 -4
  248. transformers/models/edgetam/configuration_edgetam.py +1 -1
  249. transformers/models/edgetam/modeling_edgetam.py +53 -53
  250. transformers/models/edgetam/modular_edgetam.py +5 -7
  251. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
  252. transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
  253. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
  254. transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
  255. transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
  256. transformers/models/electra/configuration_electra.py +13 -2
  257. transformers/models/electra/modeling_electra.py +6 -6
  258. transformers/models/emu3/configuration_emu3.py +12 -10
  259. transformers/models/emu3/modeling_emu3.py +84 -47
  260. transformers/models/emu3/modular_emu3.py +77 -39
  261. transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
  262. transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
  263. transformers/models/eomt/configuration_eomt.py +12 -13
  264. transformers/models/eomt/image_processing_eomt_fast.py +3 -3
  265. transformers/models/eomt/modeling_eomt.py +3 -3
  266. transformers/models/eomt/modular_eomt.py +17 -17
  267. transformers/models/eomt_dinov3/__init__.py +28 -0
  268. transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
  269. transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
  270. transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
  271. transformers/models/ernie/configuration_ernie.py +24 -2
  272. transformers/models/ernie/modeling_ernie.py +6 -30
  273. transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
  274. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  275. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
  276. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
  277. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
  278. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
  279. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
  280. transformers/models/esm/configuration_esm.py +9 -11
  281. transformers/models/esm/modeling_esm.py +3 -3
  282. transformers/models/esm/modeling_esmfold.py +1 -6
  283. transformers/models/esm/openfold_utils/protein.py +2 -3
  284. transformers/models/evolla/configuration_evolla.py +21 -8
  285. transformers/models/evolla/modeling_evolla.py +11 -7
  286. transformers/models/evolla/modular_evolla.py +5 -1
  287. transformers/models/exaone4/configuration_exaone4.py +8 -5
  288. transformers/models/exaone4/modeling_exaone4.py +4 -4
  289. transformers/models/exaone4/modular_exaone4.py +11 -8
  290. transformers/models/exaone_moe/__init__.py +27 -0
  291. transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
  292. transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
  293. transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
  294. transformers/models/falcon/configuration_falcon.py +9 -1
  295. transformers/models/falcon/modeling_falcon.py +3 -8
  296. transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
  297. transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
  298. transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
  299. transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
  300. transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
  301. transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
  302. transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
  303. transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
  304. transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
  305. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
  306. transformers/models/flaubert/configuration_flaubert.py +10 -4
  307. transformers/models/flaubert/modeling_flaubert.py +1 -1
  308. transformers/models/flava/configuration_flava.py +4 -3
  309. transformers/models/flava/image_processing_flava_fast.py +4 -4
  310. transformers/models/flava/modeling_flava.py +36 -28
  311. transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
  312. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
  313. transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
  314. transformers/models/florence2/configuration_florence2.py +4 -0
  315. transformers/models/florence2/modeling_florence2.py +57 -32
  316. transformers/models/florence2/modular_florence2.py +48 -26
  317. transformers/models/fnet/configuration_fnet.py +6 -1
  318. transformers/models/focalnet/configuration_focalnet.py +2 -4
  319. transformers/models/focalnet/modeling_focalnet.py +10 -7
  320. transformers/models/fsmt/configuration_fsmt.py +12 -16
  321. transformers/models/funnel/configuration_funnel.py +8 -0
  322. transformers/models/fuyu/configuration_fuyu.py +5 -8
  323. transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
  324. transformers/models/fuyu/modeling_fuyu.py +24 -23
  325. transformers/models/gemma/configuration_gemma.py +5 -7
  326. transformers/models/gemma/modeling_gemma.py +4 -4
  327. transformers/models/gemma/modular_gemma.py +5 -7
  328. transformers/models/gemma2/configuration_gemma2.py +5 -7
  329. transformers/models/gemma2/modeling_gemma2.py +4 -4
  330. transformers/models/gemma2/modular_gemma2.py +8 -10
  331. transformers/models/gemma3/configuration_gemma3.py +28 -22
  332. transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
  333. transformers/models/gemma3/modeling_gemma3.py +37 -33
  334. transformers/models/gemma3/modular_gemma3.py +46 -42
  335. transformers/models/gemma3n/configuration_gemma3n.py +35 -22
  336. transformers/models/gemma3n/modeling_gemma3n.py +86 -58
  337. transformers/models/gemma3n/modular_gemma3n.py +112 -75
  338. transformers/models/git/configuration_git.py +5 -7
  339. transformers/models/git/modeling_git.py +31 -41
  340. transformers/models/glm/configuration_glm.py +7 -9
  341. transformers/models/glm/modeling_glm.py +4 -4
  342. transformers/models/glm4/configuration_glm4.py +7 -9
  343. transformers/models/glm4/modeling_glm4.py +4 -4
  344. transformers/models/glm46v/configuration_glm46v.py +4 -0
  345. transformers/models/glm46v/image_processing_glm46v.py +5 -2
  346. transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
  347. transformers/models/glm46v/modeling_glm46v.py +91 -46
  348. transformers/models/glm46v/modular_glm46v.py +4 -0
  349. transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
  350. transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
  351. transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
  352. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
  353. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
  354. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
  355. transformers/models/glm4v/configuration_glm4v.py +12 -8
  356. transformers/models/glm4v/image_processing_glm4v.py +5 -2
  357. transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
  358. transformers/models/glm4v/modeling_glm4v.py +120 -63
  359. transformers/models/glm4v/modular_glm4v.py +82 -50
  360. transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
  361. transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
  362. transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
  363. transformers/models/glm_image/configuration_glm_image.py +26 -20
  364. transformers/models/glm_image/image_processing_glm_image.py +1 -1
  365. transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
  366. transformers/models/glm_image/modeling_glm_image.py +337 -236
  367. transformers/models/glm_image/modular_glm_image.py +415 -255
  368. transformers/models/glm_image/processing_glm_image.py +65 -17
  369. transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
  370. transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
  371. transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
  372. transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
  373. transformers/models/glmasr/modeling_glmasr.py +34 -28
  374. transformers/models/glmasr/modular_glmasr.py +23 -11
  375. transformers/models/glpn/image_processing_glpn_fast.py +3 -3
  376. transformers/models/glpn/modeling_glpn.py +4 -2
  377. transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
  378. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
  379. transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
  380. transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
  381. transformers/models/gpt2/configuration_gpt2.py +13 -1
  382. transformers/models/gpt2/modeling_gpt2.py +5 -5
  383. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
  384. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
  385. transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
  386. transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
  387. transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
  388. transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
  389. transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
  390. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
  391. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
  392. transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
  393. transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
  394. transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
  395. transformers/models/gptj/configuration_gptj.py +4 -4
  396. transformers/models/gptj/modeling_gptj.py +3 -7
  397. transformers/models/granite/configuration_granite.py +5 -7
  398. transformers/models/granite/modeling_granite.py +4 -4
  399. transformers/models/granite_speech/modeling_granite_speech.py +63 -37
  400. transformers/models/granitemoe/configuration_granitemoe.py +5 -7
  401. transformers/models/granitemoe/modeling_granitemoe.py +4 -4
  402. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
  403. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
  404. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
  405. transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
  406. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
  407. transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
  408. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
  409. transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
  410. transformers/models/groupvit/configuration_groupvit.py +4 -1
  411. transformers/models/groupvit/modeling_groupvit.py +29 -22
  412. transformers/models/helium/configuration_helium.py +5 -7
  413. transformers/models/helium/modeling_helium.py +4 -4
  414. transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
  415. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
  416. transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
  417. transformers/models/hiera/configuration_hiera.py +2 -4
  418. transformers/models/hiera/modeling_hiera.py +11 -8
  419. transformers/models/hubert/configuration_hubert.py +4 -1
  420. transformers/models/hubert/modeling_hubert.py +7 -4
  421. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
  422. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
  423. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
  424. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
  425. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
  426. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
  427. transformers/models/ibert/configuration_ibert.py +4 -1
  428. transformers/models/idefics/configuration_idefics.py +5 -7
  429. transformers/models/idefics/modeling_idefics.py +3 -4
  430. transformers/models/idefics/vision.py +5 -4
  431. transformers/models/idefics2/configuration_idefics2.py +1 -2
  432. transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
  433. transformers/models/idefics2/modeling_idefics2.py +72 -50
  434. transformers/models/idefics3/configuration_idefics3.py +1 -3
  435. transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
  436. transformers/models/idefics3/modeling_idefics3.py +63 -40
  437. transformers/models/ijepa/modeling_ijepa.py +3 -3
  438. transformers/models/imagegpt/configuration_imagegpt.py +9 -1
  439. transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
  440. transformers/models/imagegpt/modeling_imagegpt.py +8 -4
  441. transformers/models/informer/modeling_informer.py +3 -3
  442. transformers/models/instructblip/configuration_instructblip.py +2 -1
  443. transformers/models/instructblip/modeling_instructblip.py +65 -39
  444. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
  445. transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
  446. transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
  447. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
  448. transformers/models/internvl/configuration_internvl.py +5 -0
  449. transformers/models/internvl/modeling_internvl.py +35 -55
  450. transformers/models/internvl/modular_internvl.py +26 -38
  451. transformers/models/internvl/video_processing_internvl.py +2 -2
  452. transformers/models/jais2/configuration_jais2.py +5 -7
  453. transformers/models/jais2/modeling_jais2.py +4 -4
  454. transformers/models/jamba/configuration_jamba.py +5 -7
  455. transformers/models/jamba/modeling_jamba.py +4 -4
  456. transformers/models/jamba/modular_jamba.py +3 -3
  457. transformers/models/janus/image_processing_janus.py +2 -2
  458. transformers/models/janus/image_processing_janus_fast.py +8 -8
  459. transformers/models/janus/modeling_janus.py +63 -146
  460. transformers/models/janus/modular_janus.py +62 -20
  461. transformers/models/jetmoe/configuration_jetmoe.py +6 -4
  462. transformers/models/jetmoe/modeling_jetmoe.py +3 -3
  463. transformers/models/jetmoe/modular_jetmoe.py +3 -3
  464. transformers/models/kosmos2/configuration_kosmos2.py +10 -8
  465. transformers/models/kosmos2/modeling_kosmos2.py +56 -34
  466. transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
  467. transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
  468. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
  469. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
  470. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
  471. transformers/models/lasr/configuration_lasr.py +2 -4
  472. transformers/models/lasr/modeling_lasr.py +3 -3
  473. transformers/models/lasr/modular_lasr.py +3 -3
  474. transformers/models/layoutlm/configuration_layoutlm.py +14 -1
  475. transformers/models/layoutlm/modeling_layoutlm.py +3 -3
  476. transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
  477. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
  478. transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
  479. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
  480. transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
  481. transformers/models/led/configuration_led.py +7 -8
  482. transformers/models/levit/image_processing_levit_fast.py +4 -4
  483. transformers/models/lfm2/configuration_lfm2.py +5 -7
  484. transformers/models/lfm2/modeling_lfm2.py +4 -4
  485. transformers/models/lfm2/modular_lfm2.py +3 -3
  486. transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
  487. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
  488. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  489. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
  490. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
  491. transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
  492. transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
  493. transformers/models/lightglue/modeling_lightglue.py +3 -3
  494. transformers/models/lightglue/modular_lightglue.py +3 -3
  495. transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
  496. transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
  497. transformers/models/lilt/configuration_lilt.py +6 -1
  498. transformers/models/llama/configuration_llama.py +5 -7
  499. transformers/models/llama/modeling_llama.py +4 -4
  500. transformers/models/llama4/configuration_llama4.py +67 -47
  501. transformers/models/llama4/image_processing_llama4_fast.py +3 -3
  502. transformers/models/llama4/modeling_llama4.py +46 -44
  503. transformers/models/llava/configuration_llava.py +10 -0
  504. transformers/models/llava/image_processing_llava_fast.py +3 -3
  505. transformers/models/llava/modeling_llava.py +38 -65
  506. transformers/models/llava_next/configuration_llava_next.py +2 -1
  507. transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
  508. transformers/models/llava_next/modeling_llava_next.py +61 -60
  509. transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
  510. transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
  511. transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
  512. transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
  513. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
  514. transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
  515. transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
  516. transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
  517. transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
  518. transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
  519. transformers/models/longformer/configuration_longformer.py +4 -1
  520. transformers/models/longt5/configuration_longt5.py +9 -6
  521. transformers/models/longt5/modeling_longt5.py +2 -1
  522. transformers/models/luke/configuration_luke.py +8 -1
  523. transformers/models/lw_detr/configuration_lw_detr.py +19 -31
  524. transformers/models/lw_detr/modeling_lw_detr.py +43 -44
  525. transformers/models/lw_detr/modular_lw_detr.py +36 -38
  526. transformers/models/lxmert/configuration_lxmert.py +16 -0
  527. transformers/models/m2m_100/configuration_m2m_100.py +7 -8
  528. transformers/models/m2m_100/modeling_m2m_100.py +3 -3
  529. transformers/models/mamba/configuration_mamba.py +5 -2
  530. transformers/models/mamba/modeling_mamba.py +18 -26
  531. transformers/models/mamba2/configuration_mamba2.py +5 -7
  532. transformers/models/mamba2/modeling_mamba2.py +22 -33
  533. transformers/models/marian/configuration_marian.py +10 -4
  534. transformers/models/marian/modeling_marian.py +4 -4
  535. transformers/models/markuplm/configuration_markuplm.py +4 -6
  536. transformers/models/markuplm/modeling_markuplm.py +3 -3
  537. transformers/models/mask2former/configuration_mask2former.py +12 -47
  538. transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
  539. transformers/models/mask2former/modeling_mask2former.py +18 -12
  540. transformers/models/maskformer/configuration_maskformer.py +14 -45
  541. transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
  542. transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
  543. transformers/models/maskformer/modeling_maskformer.py +15 -9
  544. transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
  545. transformers/models/mbart/configuration_mbart.py +9 -4
  546. transformers/models/mbart/modeling_mbart.py +9 -6
  547. transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
  548. transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
  549. transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
  550. transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
  551. transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
  552. transformers/models/mgp_str/modeling_mgp_str.py +4 -2
  553. transformers/models/mimi/configuration_mimi.py +4 -0
  554. transformers/models/mimi/modeling_mimi.py +40 -36
  555. transformers/models/minimax/configuration_minimax.py +8 -11
  556. transformers/models/minimax/modeling_minimax.py +5 -5
  557. transformers/models/minimax/modular_minimax.py +9 -12
  558. transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
  559. transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
  560. transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
  561. transformers/models/ministral/configuration_ministral.py +5 -7
  562. transformers/models/ministral/modeling_ministral.py +4 -4
  563. transformers/models/ministral/modular_ministral.py +5 -8
  564. transformers/models/ministral3/configuration_ministral3.py +4 -4
  565. transformers/models/ministral3/modeling_ministral3.py +4 -4
  566. transformers/models/ministral3/modular_ministral3.py +3 -3
  567. transformers/models/mistral/configuration_mistral.py +5 -7
  568. transformers/models/mistral/modeling_mistral.py +4 -4
  569. transformers/models/mistral/modular_mistral.py +3 -3
  570. transformers/models/mistral3/configuration_mistral3.py +4 -0
  571. transformers/models/mistral3/modeling_mistral3.py +36 -40
  572. transformers/models/mistral3/modular_mistral3.py +31 -32
  573. transformers/models/mixtral/configuration_mixtral.py +8 -11
  574. transformers/models/mixtral/modeling_mixtral.py +4 -4
  575. transformers/models/mlcd/modeling_mlcd.py +7 -5
  576. transformers/models/mlcd/modular_mlcd.py +7 -5
  577. transformers/models/mllama/configuration_mllama.py +5 -7
  578. transformers/models/mllama/image_processing_mllama_fast.py +6 -5
  579. transformers/models/mllama/modeling_mllama.py +19 -19
  580. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
  581. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
  582. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
  583. transformers/models/mobilebert/configuration_mobilebert.py +4 -1
  584. transformers/models/mobilebert/modeling_mobilebert.py +3 -3
  585. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
  586. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
  587. transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
  588. transformers/models/mobilevit/modeling_mobilevit.py +4 -2
  589. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
  590. transformers/models/modernbert/configuration_modernbert.py +46 -21
  591. transformers/models/modernbert/modeling_modernbert.py +146 -899
  592. transformers/models/modernbert/modular_modernbert.py +185 -908
  593. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
  594. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
  595. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
  596. transformers/models/moonshine/configuration_moonshine.py +12 -7
  597. transformers/models/moonshine/modeling_moonshine.py +7 -7
  598. transformers/models/moonshine/modular_moonshine.py +19 -13
  599. transformers/models/moshi/configuration_moshi.py +28 -2
  600. transformers/models/moshi/modeling_moshi.py +4 -9
  601. transformers/models/mpnet/configuration_mpnet.py +6 -1
  602. transformers/models/mpt/configuration_mpt.py +16 -0
  603. transformers/models/mra/configuration_mra.py +8 -1
  604. transformers/models/mt5/configuration_mt5.py +9 -5
  605. transformers/models/mt5/modeling_mt5.py +5 -8
  606. transformers/models/musicgen/configuration_musicgen.py +12 -7
  607. transformers/models/musicgen/modeling_musicgen.py +6 -5
  608. transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
  609. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
  610. transformers/models/mvp/configuration_mvp.py +8 -4
  611. transformers/models/mvp/modeling_mvp.py +6 -4
  612. transformers/models/nanochat/configuration_nanochat.py +5 -7
  613. transformers/models/nanochat/modeling_nanochat.py +4 -4
  614. transformers/models/nanochat/modular_nanochat.py +4 -4
  615. transformers/models/nemotron/configuration_nemotron.py +5 -7
  616. transformers/models/nemotron/modeling_nemotron.py +4 -14
  617. transformers/models/nllb/tokenization_nllb.py +7 -5
  618. transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
  619. transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
  620. transformers/models/nougat/image_processing_nougat_fast.py +8 -8
  621. transformers/models/nystromformer/configuration_nystromformer.py +8 -1
  622. transformers/models/olmo/configuration_olmo.py +5 -7
  623. transformers/models/olmo/modeling_olmo.py +4 -4
  624. transformers/models/olmo/modular_olmo.py +3 -3
  625. transformers/models/olmo2/configuration_olmo2.py +9 -11
  626. transformers/models/olmo2/modeling_olmo2.py +4 -4
  627. transformers/models/olmo2/modular_olmo2.py +7 -7
  628. transformers/models/olmo3/configuration_olmo3.py +10 -11
  629. transformers/models/olmo3/modeling_olmo3.py +4 -4
  630. transformers/models/olmo3/modular_olmo3.py +13 -14
  631. transformers/models/olmoe/configuration_olmoe.py +5 -7
  632. transformers/models/olmoe/modeling_olmoe.py +4 -4
  633. transformers/models/olmoe/modular_olmoe.py +3 -3
  634. transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
  635. transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
  636. transformers/models/oneformer/configuration_oneformer.py +9 -46
  637. transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
  638. transformers/models/oneformer/modeling_oneformer.py +14 -9
  639. transformers/models/openai/configuration_openai.py +16 -0
  640. transformers/models/opt/configuration_opt.py +6 -6
  641. transformers/models/opt/modeling_opt.py +5 -5
  642. transformers/models/ovis2/configuration_ovis2.py +4 -0
  643. transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
  644. transformers/models/ovis2/modeling_ovis2.py +58 -99
  645. transformers/models/ovis2/modular_ovis2.py +52 -13
  646. transformers/models/owlv2/configuration_owlv2.py +4 -1
  647. transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
  648. transformers/models/owlv2/modeling_owlv2.py +40 -27
  649. transformers/models/owlv2/modular_owlv2.py +5 -5
  650. transformers/models/owlvit/configuration_owlvit.py +4 -1
  651. transformers/models/owlvit/modeling_owlvit.py +40 -27
  652. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
  653. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
  654. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
  655. transformers/models/paligemma/configuration_paligemma.py +4 -0
  656. transformers/models/paligemma/modeling_paligemma.py +30 -26
  657. transformers/models/parakeet/configuration_parakeet.py +2 -4
  658. transformers/models/parakeet/modeling_parakeet.py +3 -3
  659. transformers/models/parakeet/modular_parakeet.py +3 -3
  660. transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
  661. transformers/models/patchtst/modeling_patchtst.py +3 -3
  662. transformers/models/pe_audio/modeling_pe_audio.py +4 -4
  663. transformers/models/pe_audio/modular_pe_audio.py +1 -1
  664. transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
  665. transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
  666. transformers/models/pe_video/modeling_pe_video.py +36 -24
  667. transformers/models/pe_video/modular_pe_video.py +36 -23
  668. transformers/models/pegasus/configuration_pegasus.py +8 -5
  669. transformers/models/pegasus/modeling_pegasus.py +4 -4
  670. transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
  671. transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
  672. transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
  673. transformers/models/perceiver/modeling_perceiver.py +17 -9
  674. transformers/models/perception_lm/modeling_perception_lm.py +26 -27
  675. transformers/models/perception_lm/modular_perception_lm.py +27 -25
  676. transformers/models/persimmon/configuration_persimmon.py +5 -7
  677. transformers/models/persimmon/modeling_persimmon.py +5 -5
  678. transformers/models/phi/configuration_phi.py +8 -6
  679. transformers/models/phi/modeling_phi.py +4 -4
  680. transformers/models/phi/modular_phi.py +3 -3
  681. transformers/models/phi3/configuration_phi3.py +9 -11
  682. transformers/models/phi3/modeling_phi3.py +4 -4
  683. transformers/models/phi3/modular_phi3.py +3 -3
  684. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
  685. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
  686. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
  687. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
  688. transformers/models/phimoe/configuration_phimoe.py +5 -7
  689. transformers/models/phimoe/modeling_phimoe.py +15 -39
  690. transformers/models/phimoe/modular_phimoe.py +12 -7
  691. transformers/models/pix2struct/configuration_pix2struct.py +12 -9
  692. transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
  693. transformers/models/pix2struct/modeling_pix2struct.py +14 -7
  694. transformers/models/pixio/configuration_pixio.py +2 -4
  695. transformers/models/pixio/modeling_pixio.py +9 -8
  696. transformers/models/pixio/modular_pixio.py +4 -2
  697. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
  698. transformers/models/pixtral/modeling_pixtral.py +9 -12
  699. transformers/models/plbart/configuration_plbart.py +8 -5
  700. transformers/models/plbart/modeling_plbart.py +9 -7
  701. transformers/models/plbart/modular_plbart.py +1 -1
  702. transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
  703. transformers/models/pop2piano/configuration_pop2piano.py +7 -6
  704. transformers/models/pop2piano/modeling_pop2piano.py +2 -1
  705. transformers/models/pp_doclayout_v3/__init__.py +30 -0
  706. transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
  707. transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
  708. transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
  709. transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
  710. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
  711. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
  712. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
  713. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
  714. transformers/models/prophetnet/configuration_prophetnet.py +11 -10
  715. transformers/models/prophetnet/modeling_prophetnet.py +12 -23
  716. transformers/models/pvt/image_processing_pvt.py +7 -7
  717. transformers/models/pvt/image_processing_pvt_fast.py +1 -1
  718. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  719. transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
  720. transformers/models/qwen2/configuration_qwen2.py +14 -4
  721. transformers/models/qwen2/modeling_qwen2.py +4 -4
  722. transformers/models/qwen2/modular_qwen2.py +3 -3
  723. transformers/models/qwen2/tokenization_qwen2.py +0 -4
  724. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
  725. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
  726. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
  727. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
  728. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
  729. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
  730. transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
  731. transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
  732. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  733. transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
  734. transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
  735. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
  736. transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
  737. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
  738. transformers/models/qwen3/configuration_qwen3.py +15 -5
  739. transformers/models/qwen3/modeling_qwen3.py +4 -4
  740. transformers/models/qwen3/modular_qwen3.py +3 -3
  741. transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
  742. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  743. transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
  744. transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
  745. transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
  746. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
  747. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
  748. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
  749. transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
  750. transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
  751. transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
  752. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
  753. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
  754. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
  755. transformers/models/rag/configuration_rag.py +6 -6
  756. transformers/models/rag/modeling_rag.py +3 -3
  757. transformers/models/rag/retrieval_rag.py +1 -1
  758. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
  759. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
  760. transformers/models/reformer/configuration_reformer.py +7 -7
  761. transformers/models/rembert/configuration_rembert.py +8 -1
  762. transformers/models/rembert/modeling_rembert.py +0 -22
  763. transformers/models/resnet/configuration_resnet.py +2 -4
  764. transformers/models/resnet/modeling_resnet.py +6 -5
  765. transformers/models/roberta/configuration_roberta.py +11 -2
  766. transformers/models/roberta/modeling_roberta.py +6 -6
  767. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
  768. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
  769. transformers/models/roc_bert/configuration_roc_bert.py +8 -1
  770. transformers/models/roc_bert/modeling_roc_bert.py +6 -41
  771. transformers/models/roformer/configuration_roformer.py +13 -2
  772. transformers/models/roformer/modeling_roformer.py +0 -14
  773. transformers/models/rt_detr/configuration_rt_detr.py +8 -49
  774. transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
  775. transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
  776. transformers/models/rt_detr/modeling_rt_detr.py +578 -737
  777. transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
  778. transformers/models/rt_detr/modular_rt_detr.py +1508 -6
  779. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
  780. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
  781. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
  782. transformers/models/rwkv/configuration_rwkv.py +2 -3
  783. transformers/models/rwkv/modeling_rwkv.py +0 -23
  784. transformers/models/sam/configuration_sam.py +2 -0
  785. transformers/models/sam/image_processing_sam_fast.py +4 -4
  786. transformers/models/sam/modeling_sam.py +13 -8
  787. transformers/models/sam/processing_sam.py +3 -3
  788. transformers/models/sam2/configuration_sam2.py +1 -1
  789. transformers/models/sam2/modeling_sam2.py +56 -52
  790. transformers/models/sam2/modular_sam2.py +47 -55
  791. transformers/models/sam2_video/modeling_sam2_video.py +50 -51
  792. transformers/models/sam2_video/modular_sam2_video.py +12 -10
  793. transformers/models/sam3/modeling_sam3.py +43 -47
  794. transformers/models/sam3/processing_sam3.py +8 -4
  795. transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
  796. transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
  797. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
  798. transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
  799. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
  800. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
  801. transformers/models/sam3_video/modeling_sam3_video.py +27 -14
  802. transformers/models/sam_hq/configuration_sam_hq.py +2 -0
  803. transformers/models/sam_hq/modeling_sam_hq.py +13 -9
  804. transformers/models/sam_hq/modular_sam_hq.py +6 -6
  805. transformers/models/sam_hq/processing_sam_hq.py +7 -6
  806. transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
  807. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
  808. transformers/models/seed_oss/configuration_seed_oss.py +7 -9
  809. transformers/models/seed_oss/modeling_seed_oss.py +4 -4
  810. transformers/models/seed_oss/modular_seed_oss.py +3 -3
  811. transformers/models/segformer/image_processing_segformer_fast.py +4 -4
  812. transformers/models/segformer/modeling_segformer.py +4 -2
  813. transformers/models/segformer/modular_segformer.py +3 -3
  814. transformers/models/seggpt/modeling_seggpt.py +20 -8
  815. transformers/models/sew/configuration_sew.py +4 -1
  816. transformers/models/sew/modeling_sew.py +9 -5
  817. transformers/models/sew/modular_sew.py +2 -1
  818. transformers/models/sew_d/configuration_sew_d.py +4 -1
  819. transformers/models/sew_d/modeling_sew_d.py +4 -1
  820. transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
  821. transformers/models/siglip/configuration_siglip.py +4 -1
  822. transformers/models/siglip/modeling_siglip.py +27 -71
  823. transformers/models/siglip2/__init__.py +1 -0
  824. transformers/models/siglip2/configuration_siglip2.py +4 -2
  825. transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
  826. transformers/models/siglip2/modeling_siglip2.py +37 -78
  827. transformers/models/siglip2/modular_siglip2.py +74 -25
  828. transformers/models/siglip2/tokenization_siglip2.py +95 -0
  829. transformers/models/smollm3/configuration_smollm3.py +6 -6
  830. transformers/models/smollm3/modeling_smollm3.py +4 -4
  831. transformers/models/smollm3/modular_smollm3.py +9 -9
  832. transformers/models/smolvlm/configuration_smolvlm.py +1 -3
  833. transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
  834. transformers/models/smolvlm/modeling_smolvlm.py +75 -46
  835. transformers/models/smolvlm/modular_smolvlm.py +36 -23
  836. transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
  837. transformers/models/solar_open/__init__.py +27 -0
  838. transformers/models/solar_open/configuration_solar_open.py +184 -0
  839. transformers/models/solar_open/modeling_solar_open.py +642 -0
  840. transformers/models/solar_open/modular_solar_open.py +224 -0
  841. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
  842. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
  843. transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
  844. transformers/models/speecht5/configuration_speecht5.py +7 -8
  845. transformers/models/splinter/configuration_splinter.py +6 -6
  846. transformers/models/splinter/modeling_splinter.py +8 -3
  847. transformers/models/squeezebert/configuration_squeezebert.py +14 -1
  848. transformers/models/stablelm/configuration_stablelm.py +8 -6
  849. transformers/models/stablelm/modeling_stablelm.py +5 -5
  850. transformers/models/starcoder2/configuration_starcoder2.py +11 -5
  851. transformers/models/starcoder2/modeling_starcoder2.py +5 -5
  852. transformers/models/starcoder2/modular_starcoder2.py +4 -4
  853. transformers/models/superglue/configuration_superglue.py +4 -0
  854. transformers/models/superglue/image_processing_superglue_fast.py +4 -3
  855. transformers/models/superglue/modeling_superglue.py +9 -4
  856. transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
  857. transformers/models/superpoint/modeling_superpoint.py +4 -2
  858. transformers/models/swin/configuration_swin.py +2 -4
  859. transformers/models/swin/modeling_swin.py +11 -8
  860. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
  861. transformers/models/swin2sr/modeling_swin2sr.py +4 -2
  862. transformers/models/swinv2/configuration_swinv2.py +2 -4
  863. transformers/models/swinv2/modeling_swinv2.py +10 -7
  864. transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
  865. transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
  866. transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
  867. transformers/models/t5/configuration_t5.py +9 -8
  868. transformers/models/t5/modeling_t5.py +5 -8
  869. transformers/models/t5gemma/configuration_t5gemma.py +10 -25
  870. transformers/models/t5gemma/modeling_t5gemma.py +9 -9
  871. transformers/models/t5gemma/modular_t5gemma.py +11 -24
  872. transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
  873. transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
  874. transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
  875. transformers/models/table_transformer/configuration_table_transformer.py +18 -49
  876. transformers/models/table_transformer/modeling_table_transformer.py +27 -53
  877. transformers/models/tapas/configuration_tapas.py +12 -1
  878. transformers/models/tapas/modeling_tapas.py +1 -1
  879. transformers/models/tapas/tokenization_tapas.py +1 -0
  880. transformers/models/textnet/configuration_textnet.py +4 -6
  881. transformers/models/textnet/image_processing_textnet_fast.py +3 -3
  882. transformers/models/textnet/modeling_textnet.py +15 -14
  883. transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
  884. transformers/models/timesfm/modeling_timesfm.py +5 -6
  885. transformers/models/timesfm/modular_timesfm.py +5 -6
  886. transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
  887. transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
  888. transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
  889. transformers/models/trocr/configuration_trocr.py +11 -7
  890. transformers/models/trocr/modeling_trocr.py +4 -2
  891. transformers/models/tvp/configuration_tvp.py +10 -35
  892. transformers/models/tvp/image_processing_tvp_fast.py +6 -5
  893. transformers/models/tvp/modeling_tvp.py +1 -1
  894. transformers/models/udop/configuration_udop.py +16 -7
  895. transformers/models/udop/modeling_udop.py +10 -6
  896. transformers/models/umt5/configuration_umt5.py +8 -6
  897. transformers/models/umt5/modeling_umt5.py +7 -3
  898. transformers/models/unispeech/configuration_unispeech.py +4 -1
  899. transformers/models/unispeech/modeling_unispeech.py +7 -4
  900. transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
  901. transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
  902. transformers/models/upernet/configuration_upernet.py +8 -35
  903. transformers/models/upernet/modeling_upernet.py +1 -1
  904. transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
  905. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  906. transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
  907. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
  908. transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
  909. transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
  910. transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
  911. transformers/models/video_llava/configuration_video_llava.py +4 -0
  912. transformers/models/video_llava/modeling_video_llava.py +87 -89
  913. transformers/models/videomae/modeling_videomae.py +4 -5
  914. transformers/models/vilt/configuration_vilt.py +4 -1
  915. transformers/models/vilt/image_processing_vilt_fast.py +6 -6
  916. transformers/models/vilt/modeling_vilt.py +27 -12
  917. transformers/models/vipllava/configuration_vipllava.py +4 -0
  918. transformers/models/vipllava/modeling_vipllava.py +57 -31
  919. transformers/models/vipllava/modular_vipllava.py +50 -24
  920. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
  921. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
  922. transformers/models/visual_bert/configuration_visual_bert.py +6 -1
  923. transformers/models/vit/configuration_vit.py +2 -2
  924. transformers/models/vit/modeling_vit.py +7 -5
  925. transformers/models/vit_mae/modeling_vit_mae.py +11 -7
  926. transformers/models/vit_msn/modeling_vit_msn.py +11 -7
  927. transformers/models/vitdet/configuration_vitdet.py +2 -4
  928. transformers/models/vitdet/modeling_vitdet.py +2 -3
  929. transformers/models/vitmatte/configuration_vitmatte.py +6 -35
  930. transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
  931. transformers/models/vitmatte/modeling_vitmatte.py +1 -1
  932. transformers/models/vitpose/configuration_vitpose.py +6 -43
  933. transformers/models/vitpose/modeling_vitpose.py +5 -3
  934. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
  935. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
  936. transformers/models/vits/configuration_vits.py +4 -0
  937. transformers/models/vits/modeling_vits.py +9 -7
  938. transformers/models/vivit/modeling_vivit.py +4 -4
  939. transformers/models/vjepa2/modeling_vjepa2.py +9 -9
  940. transformers/models/voxtral/configuration_voxtral.py +0 -1
  941. transformers/models/voxtral/modeling_voxtral.py +25 -24
  942. transformers/models/voxtral/modular_voxtral.py +26 -20
  943. transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
  944. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
  945. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
  946. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
  947. transformers/models/wavlm/configuration_wavlm.py +4 -1
  948. transformers/models/wavlm/modeling_wavlm.py +4 -1
  949. transformers/models/whisper/configuration_whisper.py +6 -4
  950. transformers/models/whisper/generation_whisper.py +0 -1
  951. transformers/models/whisper/modeling_whisper.py +3 -3
  952. transformers/models/x_clip/configuration_x_clip.py +4 -1
  953. transformers/models/x_clip/modeling_x_clip.py +26 -27
  954. transformers/models/xglm/configuration_xglm.py +9 -7
  955. transformers/models/xlm/configuration_xlm.py +10 -7
  956. transformers/models/xlm/modeling_xlm.py +1 -1
  957. transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
  958. transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
  959. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
  960. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
  961. transformers/models/xlnet/configuration_xlnet.py +3 -1
  962. transformers/models/xlstm/configuration_xlstm.py +5 -7
  963. transformers/models/xlstm/modeling_xlstm.py +0 -32
  964. transformers/models/xmod/configuration_xmod.py +11 -2
  965. transformers/models/xmod/modeling_xmod.py +13 -16
  966. transformers/models/yolos/image_processing_yolos_fast.py +25 -28
  967. transformers/models/yolos/modeling_yolos.py +7 -7
  968. transformers/models/yolos/modular_yolos.py +16 -16
  969. transformers/models/yoso/configuration_yoso.py +8 -1
  970. transformers/models/youtu/__init__.py +27 -0
  971. transformers/models/youtu/configuration_youtu.py +194 -0
  972. transformers/models/youtu/modeling_youtu.py +619 -0
  973. transformers/models/youtu/modular_youtu.py +254 -0
  974. transformers/models/zamba/configuration_zamba.py +5 -7
  975. transformers/models/zamba/modeling_zamba.py +25 -56
  976. transformers/models/zamba2/configuration_zamba2.py +8 -13
  977. transformers/models/zamba2/modeling_zamba2.py +53 -78
  978. transformers/models/zamba2/modular_zamba2.py +36 -29
  979. transformers/models/zoedepth/configuration_zoedepth.py +17 -40
  980. transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
  981. transformers/models/zoedepth/modeling_zoedepth.py +5 -3
  982. transformers/pipelines/__init__.py +1 -61
  983. transformers/pipelines/any_to_any.py +1 -1
  984. transformers/pipelines/automatic_speech_recognition.py +0 -2
  985. transformers/pipelines/base.py +1 -1
  986. transformers/pipelines/image_text_to_text.py +1 -1
  987. transformers/pipelines/text_to_audio.py +5 -1
  988. transformers/processing_utils.py +35 -44
  989. transformers/pytorch_utils.py +2 -26
  990. transformers/quantizers/quantizer_compressed_tensors.py +7 -5
  991. transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
  992. transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
  993. transformers/quantizers/quantizer_mxfp4.py +1 -1
  994. transformers/quantizers/quantizer_torchao.py +0 -16
  995. transformers/safetensors_conversion.py +11 -4
  996. transformers/testing_utils.py +3 -28
  997. transformers/tokenization_mistral_common.py +9 -0
  998. transformers/tokenization_python.py +6 -4
  999. transformers/tokenization_utils_base.py +119 -219
  1000. transformers/tokenization_utils_tokenizers.py +31 -2
  1001. transformers/trainer.py +25 -33
  1002. transformers/trainer_seq2seq.py +1 -1
  1003. transformers/training_args.py +411 -417
  1004. transformers/utils/__init__.py +1 -4
  1005. transformers/utils/auto_docstring.py +15 -18
  1006. transformers/utils/backbone_utils.py +13 -373
  1007. transformers/utils/doc.py +4 -36
  1008. transformers/utils/generic.py +69 -33
  1009. transformers/utils/import_utils.py +72 -75
  1010. transformers/utils/loading_report.py +133 -105
  1011. transformers/utils/quantization_config.py +0 -21
  1012. transformers/video_processing_utils.py +5 -5
  1013. transformers/video_utils.py +3 -1
  1014. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
  1015. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
  1016. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
  1017. transformers/pipelines/deprecated/text2text_generation.py +0 -408
  1018. transformers/pipelines/image_to_text.py +0 -189
  1019. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
  1020. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
  1021. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -19,7 +19,6 @@
19
19
  # See the License for the specific language governing permissions and
20
20
  # limitations under the License.
21
21
 
22
-
23
22
  import math
24
23
  from collections.abc import Callable
25
24
 
@@ -32,10 +31,11 @@ from ...generation import GenerationMixin
32
31
  from ...masking_utils import create_bidirectional_mask
33
32
  from ...modeling_flash_attention_utils import FlashAttentionKwargs
34
33
  from ...modeling_layers import GradientCheckpointingLayer
35
- from ...modeling_outputs import BaseModelOutput, CausalLMOutputWithPast
34
+ from ...modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast
36
35
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
37
36
  from ...processing_utils import Unpack
38
37
  from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
38
+ from ...utils.generic import check_model_inputs
39
39
  from ..auto import AutoModel, AutoModelForCausalLM
40
40
  from .configuration_audioflamingo3 import AudioFlamingo3Config, AudioFlamingo3EncoderConfig
41
41
 
@@ -171,9 +171,9 @@ class AudioFlamingo3Attention(nn.Module):
171
171
  key_states, value_states, self.layer_idx, {"cache_position": cache_position}
172
172
  )
173
173
 
174
- attention_interface: Callable = eager_attention_forward
175
- if self.config._attn_implementation != "eager":
176
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
174
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
175
+ self.config._attn_implementation, eager_attention_forward
176
+ )
177
177
 
178
178
  attn_output, attn_weights = attention_interface(
179
179
  self,
@@ -280,6 +280,11 @@ class AudioFlamingo3Encoder(AudioFlamingo3PreTrainedModel):
280
280
  input_modalities = "audio"
281
281
  _no_split_modules = ["AudioFlamingo3EncoderLayer"]
282
282
 
283
+ _can_record_outputs = {
284
+ "hidden_states": AudioFlamingo3EncoderLayer,
285
+ "attentions": AudioFlamingo3Attention,
286
+ }
287
+
283
288
  def __init__(self, config: AudioFlamingo3EncoderConfig):
284
289
  super().__init__(config)
285
290
  self.dropout = config.dropout
@@ -287,7 +292,6 @@ class AudioFlamingo3Encoder(AudioFlamingo3PreTrainedModel):
287
292
 
288
293
  embed_dim = config.d_model
289
294
  self.num_mel_bins = config.num_mel_bins
290
- self.padding_idx = config.pad_token_id
291
295
  self.max_source_positions = config.max_source_positions
292
296
  self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
293
297
 
@@ -317,13 +321,13 @@ class AudioFlamingo3Encoder(AudioFlamingo3PreTrainedModel):
317
321
  def set_input_embeddings(self, value: nn.Module):
318
322
  self.conv1 = value
319
323
 
320
- @can_return_tuple
324
+ @check_model_inputs
321
325
  def forward(
322
326
  self,
323
327
  input_features: torch.Tensor,
324
328
  input_features_mask: torch.Tensor | None = None,
325
329
  **kwargs,
326
- ):
330
+ ) -> tuple | BaseModelOutputWithPooling:
327
331
  r"""
328
332
  Args:
329
333
  input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
@@ -367,7 +371,7 @@ class AudioFlamingo3Encoder(AudioFlamingo3PreTrainedModel):
367
371
  hidden_states = self.avg_pooler(hidden_states).permute(0, 2, 1)
368
372
  hidden_states = self.layer_norm(hidden_states)
369
373
 
370
- return BaseModelOutput(
374
+ return BaseModelOutputWithPooling(
371
375
  last_hidden_state=hidden_states,
372
376
  )
373
377
 
@@ -442,35 +446,40 @@ class AudioFlamingo3ForConditionalGeneration(AudioFlamingo3PreTrainedModel, Gene
442
446
  def get_decoder(self):
443
447
  return self.language_model.get_decoder()
444
448
 
449
+ @can_return_tuple
450
+ @auto_docstring(
451
+ custom_intro="This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector."
452
+ )
445
453
  def get_audio_features(
446
- self, input_features: torch.FloatTensor, input_features_mask: torch.Tensor
447
- ) -> torch.FloatTensor:
448
- """
449
- This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.
450
- Args:
451
- input_features (`torch.FloatTensor`):
452
- Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
453
- obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
454
- `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
455
- `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
456
- and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
457
- input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
458
- Mask to avoid performing attention on padded feature indices.
459
-
460
- Returns:
461
- `torch.FloatTensor`:
462
- The audio embeddings.
454
+ self,
455
+ input_features: torch.FloatTensor,
456
+ input_features_mask: torch.Tensor,
457
+ **kwargs: Unpack[TransformersKwargs],
458
+ ) -> tuple | BaseModelOutputWithPooling:
459
+ r"""
460
+ input_features (`torch.FloatTensor`):
461
+ Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
462
+ obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
463
+ `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
464
+ `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
465
+ and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
466
+ input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
467
+ Mask to avoid performing attention on padded feature indices.
463
468
  """
464
469
 
465
470
  # Encode audio
466
- encoder_output = self.audio_tower(input_features, input_features_mask=input_features_mask)
467
- audio_embeds = self.multi_modal_projector(encoder_output.last_hidden_state)
471
+ audio_output = self.audio_tower(
472
+ input_features, input_features_mask=input_features_mask, return_dict=True, **kwargs
473
+ )
474
+ audio_embeds = self.multi_modal_projector(audio_output.last_hidden_state)
468
475
 
469
476
  # Mask according to avg pooling (which is after attention blocks)
470
477
  post_lengths = (input_features_mask.sum(-1) - 2) // 2 + 1
471
478
  valid_mask = torch.arange(audio_embeds.shape[1], device=post_lengths.device)[None, :] < post_lengths[:, None]
472
479
  audio_embeds = audio_embeds[valid_mask.to(audio_embeds.device)]
473
- return audio_embeds
480
+ audio_output.pooler_output = audio_embeds
481
+
482
+ return audio_output
474
483
 
475
484
  @can_return_tuple
476
485
  @auto_docstring
@@ -556,7 +565,7 @@ class AudioFlamingo3ForConditionalGeneration(AudioFlamingo3PreTrainedModel, Gene
556
565
  inputs_embeds = self.get_input_embeddings()(input_ids)
557
566
 
558
567
  if input_features is not None and input_ids is not None:
559
- audio_embeds = self.get_audio_features(input_features, input_features_mask)
568
+ audio_embeds = self.get_audio_features(input_features, input_features_mask, return_dict=True).pooler_output
560
569
 
561
570
  # replace text-audio token placeholders with audio embeddings
562
571
  audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
@@ -13,28 +13,32 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
-
17
16
  import torch
18
17
  from torch import nn
19
18
 
20
19
  from ...activations import ACT2FN
21
20
  from ...cache_utils import Cache
22
21
  from ...masking_utils import create_bidirectional_mask
23
- from ...modeling_outputs import BaseModelOutput, CausalLMOutputWithPast
22
+ from ...modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast
24
23
  from ...processing_utils import Unpack
25
24
  from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
25
+ from ...utils.generic import check_model_inputs
26
26
  from ..qwen2_audio.modeling_qwen2_audio import (
27
27
  Qwen2AudioEncoder,
28
28
  Qwen2AudioPreTrainedModel,
29
29
  )
30
30
  from ..voxtral.modeling_voxtral import VoxtralForConditionalGeneration, VoxtralMultiModalProjector
31
- from ..whisper.modeling_whisper import WhisperEncoderLayer
31
+ from ..whisper.modeling_whisper import WhisperAttention, WhisperEncoderLayer
32
32
  from .configuration_audioflamingo3 import AudioFlamingo3Config
33
33
 
34
34
 
35
35
  logger = logging.get_logger(__name__)
36
36
 
37
37
 
38
+ class AudioFlamingo3Attention(WhisperAttention):
39
+ pass
40
+
41
+
38
42
  class AudioFlamingo3EncoderLayer(WhisperEncoderLayer):
39
43
  pass
40
44
 
@@ -53,13 +57,18 @@ class AudioFlamingo3Encoder(Qwen2AudioEncoder):
53
57
  AudioFlamingo3 encoder: Whisper encoder, average pool (time/2), then LayerNorm.
54
58
  """
55
59
 
56
- @can_return_tuple
60
+ _can_record_outputs = {
61
+ "hidden_states": AudioFlamingo3EncoderLayer,
62
+ "attentions": AudioFlamingo3Attention,
63
+ }
64
+
65
+ @check_model_inputs
57
66
  def forward(
58
67
  self,
59
68
  input_features: torch.Tensor,
60
69
  input_features_mask: torch.Tensor | None = None,
61
70
  **kwargs,
62
- ):
71
+ ) -> tuple | BaseModelOutputWithPooling:
63
72
  r"""
64
73
  Args:
65
74
  input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
@@ -103,7 +112,7 @@ class AudioFlamingo3Encoder(Qwen2AudioEncoder):
103
112
  hidden_states = self.avg_pooler(hidden_states).permute(0, 2, 1)
104
113
  hidden_states = self.layer_norm(hidden_states)
105
114
 
106
- return BaseModelOutput(
115
+ return BaseModelOutputWithPooling(
107
116
  last_hidden_state=hidden_states,
108
117
  )
109
118
 
@@ -138,35 +147,40 @@ class AudioFlamingo3ForConditionalGeneration(VoxtralForConditionalGeneration):
138
147
  def __init__(self, config):
139
148
  super().__init__(config)
140
149
 
150
+ @can_return_tuple
151
+ @auto_docstring(
152
+ custom_intro="This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector."
153
+ )
141
154
  def get_audio_features(
142
- self, input_features: torch.FloatTensor, input_features_mask: torch.Tensor
143
- ) -> torch.FloatTensor:
144
- """
145
- This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.
146
- Args:
147
- input_features (`torch.FloatTensor`):
148
- Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
149
- obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
150
- `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
151
- `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
152
- and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
153
- input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
154
- Mask to avoid performing attention on padded feature indices.
155
-
156
- Returns:
157
- `torch.FloatTensor`:
158
- The audio embeddings.
155
+ self,
156
+ input_features: torch.FloatTensor,
157
+ input_features_mask: torch.Tensor,
158
+ **kwargs: Unpack[TransformersKwargs],
159
+ ) -> tuple | BaseModelOutputWithPooling:
160
+ r"""
161
+ input_features (`torch.FloatTensor`):
162
+ Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
163
+ obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
164
+ `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
165
+ `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
166
+ and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
167
+ input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
168
+ Mask to avoid performing attention on padded feature indices.
159
169
  """
160
170
 
161
171
  # Encode audio
162
- encoder_output = self.audio_tower(input_features, input_features_mask=input_features_mask)
163
- audio_embeds = self.multi_modal_projector(encoder_output.last_hidden_state)
172
+ audio_output = self.audio_tower(
173
+ input_features, input_features_mask=input_features_mask, return_dict=True, **kwargs
174
+ )
175
+ audio_embeds = self.multi_modal_projector(audio_output.last_hidden_state)
164
176
 
165
177
  # Mask according to avg pooling (which is after attention blocks)
166
178
  post_lengths = (input_features_mask.sum(-1) - 2) // 2 + 1
167
179
  valid_mask = torch.arange(audio_embeds.shape[1], device=post_lengths.device)[None, :] < post_lengths[:, None]
168
180
  audio_embeds = audio_embeds[valid_mask.to(audio_embeds.device)]
169
- return audio_embeds
181
+ audio_output.pooler_output = audio_embeds
182
+
183
+ return audio_output
170
184
 
171
185
  @can_return_tuple
172
186
  @auto_docstring
@@ -252,7 +266,7 @@ class AudioFlamingo3ForConditionalGeneration(VoxtralForConditionalGeneration):
252
266
  inputs_embeds = self.get_input_embeddings()(input_ids)
253
267
 
254
268
  if input_features is not None and input_ids is not None:
255
- audio_embeds = self.get_audio_features(input_features, input_features_mask)
269
+ audio_embeds = self.get_audio_features(input_features, input_features_mask, return_dict=True).pooler_output
256
270
 
257
271
  # replace text-audio token placeholders with audio embeddings
258
272
  audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
@@ -21,6 +21,8 @@ from collections import OrderedDict
21
21
  from collections.abc import Iterator
22
22
  from typing import Any, TypeVar
23
23
 
24
+ from huggingface_hub import repo_exists
25
+
24
26
  from ...configuration_utils import PreTrainedConfig
25
27
  from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
26
28
  from ...utils import (
@@ -68,7 +70,7 @@ FROM_CONFIG_DOCSTRING = """
68
70
 
69
71
  List options
70
72
  attn_implementation (`str`, *optional*):
71
- The attention implementation to use in the model (if relevant). Can be any of `"eager"` (manual implementation of the attention), `"sdpa"` (using [`F.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html)), or `"flash_attention_2"` (using [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention)). By default, if available, SDPA will be used for torch>=2.1.1. The default is otherwise the manual `"eager"` implementation.
73
+ The attention implementation to use in the model (if relevant). Can be any of `"eager"` (manual implementation of the attention), `"sdpa"` (using [`F.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html)), `"flash_attention_2"` (using [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention)), or `"flash_attention_3"` (using [Dao-AILab/flash-attention/hopper](https://github.com/Dao-AILab/flash-attention/tree/main/hopper)). By default, if available, SDPA will be used for torch>=2.1.1. The default is otherwise the manual `"eager"` implementation.
72
74
 
73
75
  Examples:
74
76
 
@@ -416,21 +418,21 @@ class _BaseAutoBackboneClass(_BaseAutoModelClass):
416
418
 
417
419
  num_channels = kwargs.pop("num_channels", config.num_channels)
418
420
  features_only = kwargs.pop("features_only", config.features_only)
419
- use_pretrained_backbone = kwargs.pop("use_pretrained_backbone", config.use_pretrained_backbone)
420
421
  out_indices = kwargs.pop("out_indices", config.out_indices)
421
422
  config = TimmBackboneConfig(
422
423
  backbone=pretrained_model_name_or_path,
423
424
  num_channels=num_channels,
424
425
  features_only=features_only,
425
- use_pretrained_backbone=use_pretrained_backbone,
426
426
  out_indices=out_indices,
427
427
  )
428
- return super().from_config(config, **kwargs)
428
+ # Always load a pretrained model when `from_pretrained` is called
429
+ kwargs.pop("use_pretrained_backbone", None)
430
+ return super().from_config(config, pretrained=True, **kwargs)
429
431
 
430
432
  @classmethod
431
433
  def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
432
- use_timm_backbone = kwargs.pop("use_timm_backbone", False)
433
- if use_timm_backbone:
434
+ kwargs.pop("use_timm_backbone", None)
435
+ if not repo_exists(pretrained_model_name_or_path):
434
436
  return cls._load_timm_backbone_from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
435
437
 
436
438
  return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
@@ -138,6 +138,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
138
138
  ("encodec", "EncodecConfig"),
139
139
  ("encoder-decoder", "EncoderDecoderConfig"),
140
140
  ("eomt", "EomtConfig"),
141
+ ("eomt_dinov3", "EomtDinov3Config"),
141
142
  ("ernie", "ErnieConfig"),
142
143
  ("ernie4_5", "Ernie4_5Config"),
143
144
  ("ernie4_5_moe", "Ernie4_5_MoeConfig"),
@@ -145,6 +146,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
145
146
  ("esm", "EsmConfig"),
146
147
  ("evolla", "EvollaConfig"),
147
148
  ("exaone4", "Exaone4Config"),
149
+ ("exaone_moe", "ExaoneMoeConfig"),
148
150
  ("falcon", "FalconConfig"),
149
151
  ("falcon_h1", "FalconH1Config"),
150
152
  ("falcon_mamba", "FalconMambaConfig"),
@@ -184,6 +186,9 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
184
186
  ("glm_image_text", "GlmImageTextConfig"),
185
187
  ("glm_image_vision", "GlmImageVisionConfig"),
186
188
  ("glm_image_vqmodel", "GlmImageVQVAEConfig"),
189
+ ("glm_ocr", "GlmOcrConfig"),
190
+ ("glm_ocr_text", "GlmOcrTextConfig"),
191
+ ("glm_ocr_vision", "GlmOcrVisionConfig"),
187
192
  ("glmasr", "GlmAsrConfig"),
188
193
  ("glmasr_encoder", "GlmAsrEncoderConfig"),
189
194
  ("glpn", "GLPNConfig"),
@@ -339,6 +344,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
339
344
  ("plbart", "PLBartConfig"),
340
345
  ("poolformer", "PoolFormerConfig"),
341
346
  ("pop2piano", "Pop2PianoConfig"),
347
+ ("pp_doclayout_v3", "PPDocLayoutV3Config"),
342
348
  ("prompt_depth_anything", "PromptDepthAnythingConfig"),
343
349
  ("prophetnet", "ProphetNetConfig"),
344
350
  ("pvt", "PvtConfig"),
@@ -403,6 +409,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
403
409
  ("smollm3", "SmolLM3Config"),
404
410
  ("smolvlm", "SmolVLMConfig"),
405
411
  ("smolvlm_vision", "SmolVLMVisionConfig"),
412
+ ("solar_open", "SolarOpenConfig"),
406
413
  ("speech-encoder-decoder", "SpeechEncoderDecoderConfig"),
407
414
  ("speech_to_text", "Speech2TextConfig"),
408
415
  ("speecht5", "SpeechT5Config"),
@@ -420,6 +427,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
420
427
  ("t5", "T5Config"),
421
428
  ("t5gemma", "T5GemmaConfig"),
422
429
  ("t5gemma2", "T5Gemma2Config"),
430
+ ("t5gemma2_encoder", "T5Gemma2EncoderConfig"),
423
431
  ("table-transformer", "TableTransformerConfig"),
424
432
  ("tapas", "TapasConfig"),
425
433
  ("textnet", "TextNetConfig"),
@@ -474,6 +482,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
474
482
  ("xmod", "XmodConfig"),
475
483
  ("yolos", "YolosConfig"),
476
484
  ("yoso", "YosoConfig"),
485
+ ("youtu", "YoutuConfig"),
477
486
  ("zamba", "ZambaConfig"),
478
487
  ("zamba2", "Zamba2Config"),
479
488
  ("zoedepth", "ZoeDepthConfig"),
@@ -598,6 +607,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
598
607
  ("encodec", "EnCodec"),
599
608
  ("encoder-decoder", "Encoder decoder"),
600
609
  ("eomt", "EoMT"),
610
+ ("eomt_dinov3", "EoMT-DINOv3"),
601
611
  ("ernie", "ERNIE"),
602
612
  ("ernie4_5", "Ernie4_5"),
603
613
  ("ernie4_5_moe", "Ernie4_5_MoE"),
@@ -605,6 +615,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
605
615
  ("esm", "ESM"),
606
616
  ("evolla", "Evolla"),
607
617
  ("exaone4", "EXAONE-4.0"),
618
+ ("exaone_moe", "EXAONE-MoE"),
608
619
  ("falcon", "Falcon"),
609
620
  ("falcon3", "Falcon3"),
610
621
  ("falcon_h1", "FalconH1"),
@@ -647,6 +658,9 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
647
658
  ("glm_image_text", "GlmImageText"),
648
659
  ("glm_image_vision", "GlmImageVisionModel"),
649
660
  ("glm_image_vqmodel", "GlmImageVQVAE"),
661
+ ("glm_ocr", "Glmocr"),
662
+ ("glm_ocr_text", "GlmOcrText"),
663
+ ("glm_ocr_vision", "GlmOcrVisionModel"),
650
664
  ("glmasr", "GLM-ASR"),
651
665
  ("glmasr_encoder", "GLM-ASR Encoder"),
652
666
  ("glpn", "GLPN"),
@@ -816,6 +830,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
816
830
  ("plbart", "PLBart"),
817
831
  ("poolformer", "PoolFormer"),
818
832
  ("pop2piano", "Pop2Piano"),
833
+ ("pp_doclayout_v3", "PPDocLayoutV3"),
819
834
  ("prompt_depth_anything", "PromptDepthAnything"),
820
835
  ("prophetnet", "ProphetNet"),
821
836
  ("pvt", "PVT"),
@@ -880,6 +895,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
880
895
  ("smollm3", "SmolLM3"),
881
896
  ("smolvlm", "SmolVLM"),
882
897
  ("smolvlm_vision", "SmolVLMVisionTransformer"),
898
+ ("solar_open", "SolarOpen"),
883
899
  ("speech-encoder-decoder", "Speech Encoder decoder"),
884
900
  ("speech_to_text", "Speech2Text"),
885
901
  ("speecht5", "SpeechT5"),
@@ -897,6 +913,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
897
913
  ("t5", "T5"),
898
914
  ("t5gemma", "T5Gemma"),
899
915
  ("t5gemma2", "T5Gemma2"),
916
+ ("t5gemma2_encoder", "T5Gemma2Encoder"),
900
917
  ("t5v1.1", "T5v1.1"),
901
918
  ("table-transformer", "Table Transformer"),
902
919
  ("tapas", "TAPAS"),
@@ -957,6 +974,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
957
974
  ("xmod", "X-MOD"),
958
975
  ("yolos", "YOLOS"),
959
976
  ("yoso", "YOSO"),
977
+ ("youtu", "Youtu"),
960
978
  ("zamba", "Zamba"),
961
979
  ("zamba2", "Zamba2"),
962
980
  ("zoedepth", "ZoeDepth"),
@@ -997,6 +1015,9 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict[str, str](
997
1015
  ("glm_image_vision", "glm_image"),
998
1016
  ("glm_image_vqmodel", "glm_image"),
999
1017
  ("glm_image_text", "glm_image"),
1018
+ ("glm_ocr_vision", "glm_ocr"),
1019
+ ("glm_ocr_vqmodel", "glm_ocr"),
1020
+ ("glm_ocr_text", "glm_ocr"),
1000
1021
  ("glmasr_encoder", "glmasr"),
1001
1022
  ("grounding-dino", "grounding_dino"),
1002
1023
  ("mm-grounding-dino", "mm_grounding_dino"),
@@ -1021,6 +1042,7 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict[str, str](
1021
1042
  ("sam3_vision_model", "sam3"),
1022
1043
  ("edgetam_vision_model", "edgetam"),
1023
1044
  ("sam_hq_vision_model", "sam_hq"),
1045
+ ("t5gemma2_encoder", "t5gemma2"),
1024
1046
  ("llama4_text", "llama4"),
1025
1047
  ("blip_2_qformer", "blip_2"),
1026
1048
  ("fastspeech2_conformer_with_hifigan", "fastspeech2_conformer"),
@@ -47,9 +47,14 @@ from .configuration_auto import (
47
47
 
48
48
  logger = logging.get_logger(__name__)
49
49
 
50
-
51
- FORCE_FAST_IMAGE_PROCESSOR = ["Qwen2VLImageProcessor"]
52
-
50
+ # These image processors use Lanczos interpolation, which is not supported by fast image processors.
51
+ # To avoid important differences in outputs, we default to using the slow image processors for these processors.
52
+ DEFAULT_TO_SLOW_IMAGE_PROCESSORS = [
53
+ "ChameleonImageProcessor",
54
+ "FlavaImageProcessor",
55
+ "Idefics3ImageProcessor",
56
+ "SmolVLMImageProcessor",
57
+ ]
53
58
 
54
59
  if TYPE_CHECKING:
55
60
  # This significantly improves completion suggestion performance when
@@ -98,6 +103,7 @@ else:
98
103
  ("efficientnet", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")),
99
104
  ("emu3", ("Emu3ImageProcessor", None)),
100
105
  ("eomt", ("EomtImageProcessor", "EomtImageProcessorFast")),
106
+ ("eomt_dinov3", ("EomtImageProcessor", "EomtImageProcessorFast")),
101
107
  ("ernie4_5_vl_moe", ("Ernie4_5_VL_MoeImageProcessor", "Ernie4_5_VL_MoeImageProcessorFast")),
102
108
  ("flava", ("FlavaImageProcessor", "FlavaImageProcessorFast")),
103
109
  ("florence2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
@@ -164,6 +170,7 @@ else:
164
170
  ("pixio", ("BitImageProcessor", "BitImageProcessorFast")),
165
171
  ("pixtral", ("PixtralImageProcessor", "PixtralImageProcessorFast")),
166
172
  ("poolformer", ("PoolFormerImageProcessor", "PoolFormerImageProcessorFast")),
173
+ ("pp_doclayout_v3", (None, "PPDocLayoutV3ImageProcessorFast")),
167
174
  ("prompt_depth_anything", ("PromptDepthAnythingImageProcessor", "PromptDepthAnythingImageProcessorFast")),
168
175
  ("pvt", ("PvtImageProcessor", "PvtImageProcessorFast")),
169
176
  ("pvt_v2", ("PvtImageProcessor", "PvtImageProcessorFast")),
@@ -196,6 +203,7 @@ else:
196
203
  ("swin2sr", ("Swin2SRImageProcessor", "Swin2SRImageProcessorFast")),
197
204
  ("swinv2", ("ViTImageProcessor", "ViTImageProcessorFast")),
198
205
  ("t5gemma2", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")),
206
+ ("t5gemma2_encoder", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")),
199
207
  ("table-transformer", ("DetrImageProcessor", "DetrImageProcessorFast")),
200
208
  ("textnet", ("TextNetImageProcessor", "TextNetImageProcessorFast")),
201
209
  ("timesformer", ("VideoMAEImageProcessor", None)),
@@ -535,24 +543,20 @@ class AutoImageProcessor:
535
543
  image_processor_auto_map = config.auto_map["AutoImageProcessor"]
536
544
 
537
545
  image_processor_class = None
538
- # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default)
539
546
  if image_processor_type is not None:
540
547
  # if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor.
541
548
  if use_fast is None:
542
549
  use_fast = image_processor_type.endswith("Fast")
543
- if not use_fast and image_processor_type in FORCE_FAST_IMAGE_PROCESSOR and is_torchvision_available():
544
- use_fast = True
550
+ if (
551
+ not use_fast
552
+ and is_torchvision_available()
553
+ and image_processor_type not in DEFAULT_TO_SLOW_IMAGE_PROCESSORS
554
+ ):
545
555
  logger.warning_once(
546
556
  f"The image processor of type `{image_processor_type}` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. "
547
557
  "This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. "
548
- "Note that this behavior will be extended to all models in a future release."
549
- )
550
- if not use_fast:
551
- logger.warning_once(
552
- "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
553
- "`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. "
554
- "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`."
555
558
  )
559
+ use_fast = True
556
560
  if use_fast and not image_processor_type.endswith("Fast"):
557
561
  image_processor_type += "Fast"
558
562
  if use_fast and not is_torchvision_available():
@@ -147,6 +147,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
147
147
  ("esm", "EsmModel"),
148
148
  ("evolla", "EvollaModel"),
149
149
  ("exaone4", "Exaone4Model"),
150
+ ("exaone_moe", "ExaoneMoeModel"),
150
151
  ("falcon", "FalconModel"),
151
152
  ("falcon_h1", "FalconH1Model"),
152
153
  ("falcon_mamba", "FalconMambaModel"),
@@ -186,6 +187,9 @@ MODEL_MAPPING_NAMES = OrderedDict(
186
187
  ("glm_image_text", "GlmImageTextModel"),
187
188
  ("glm_image_vision", "GlmImageVisionModel"),
188
189
  ("glm_image_vqmodel", "GlmImageVQVAE"),
190
+ ("glm_ocr", "GlmOcrModel"),
191
+ ("glm_ocr_text", "GlmOcrTextModel"),
192
+ ("glm_ocr_vision", "GlmOcrVisionModel"),
189
193
  ("glmasr", "GlmAsrForConditionalGeneration"),
190
194
  ("glmasr_encoder", "GlmAsrEncoder"),
191
195
  ("glpn", "GLPNModel"),
@@ -333,6 +337,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
333
337
  ("pixtral", "PixtralVisionModel"),
334
338
  ("plbart", "PLBartModel"),
335
339
  ("poolformer", "PoolFormerModel"),
340
+ ("pp_doclayout_v3", "PPDocLayoutV3Model"),
336
341
  ("prophetnet", "ProphetNetModel"),
337
342
  ("pvt", "PvtModel"),
338
343
  ("pvt_v2", "PvtV2Model"),
@@ -391,6 +396,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
391
396
  ("smollm3", "SmolLM3Model"),
392
397
  ("smolvlm", "SmolVLMModel"),
393
398
  ("smolvlm_vision", "SmolVLMVisionTransformer"),
399
+ ("solar_open", "SolarOpenModel"),
394
400
  ("speech_to_text", "Speech2TextModel"),
395
401
  ("speecht5", "SpeechT5Model"),
396
402
  ("splinter", "SplinterModel"),
@@ -405,6 +411,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
405
411
  ("t5", "T5Model"),
406
412
  ("t5gemma", "T5GemmaModel"),
407
413
  ("t5gemma2", "T5Gemma2Model"),
414
+ ("t5gemma2_encoder", "T5Gemma2Encoder"),
408
415
  ("table-transformer", "TableTransformerModel"),
409
416
  ("tapas", "TapasModel"),
410
417
  ("textnet", "TextNetModel"),
@@ -453,6 +460,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
453
460
  ("xmod", "XmodModel"),
454
461
  ("yolos", "YolosModel"),
455
462
  ("yoso", "YosoModel"),
463
+ ("youtu", "YoutuModel"),
456
464
  ("zamba", "ZambaModel"),
457
465
  ("zamba2", "Zamba2Model"),
458
466
  ]
@@ -479,6 +487,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
479
487
  ("ernie", "ErnieForPreTraining"),
480
488
  ("evolla", "EvollaForProteinText2Text"),
481
489
  ("exaone4", "Exaone4ForCausalLM"),
490
+ ("exaone_moe", "ExaoneMoeForCausalLM"),
482
491
  ("falcon_mamba", "FalconMambaForCausalLM"),
483
492
  ("flaubert", "FlaubertWithLMHeadModel"),
484
493
  ("flava", "FlavaForPreTraining"),
@@ -590,6 +599,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
590
599
  ("ernie4_5", "Ernie4_5ForCausalLM"),
591
600
  ("ernie4_5_moe", "Ernie4_5_MoeForCausalLM"),
592
601
  ("exaone4", "Exaone4ForCausalLM"),
602
+ ("exaone_moe", "ExaoneMoeForCausalLM"),
593
603
  ("falcon", "FalconForCausalLM"),
594
604
  ("falcon_h1", "FalconH1ForCausalLM"),
595
605
  ("falcon_mamba", "FalconMambaForCausalLM"),
@@ -680,6 +690,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
680
690
  ("rwkv", "RwkvForCausalLM"),
681
691
  ("seed_oss", "SeedOssForCausalLM"),
682
692
  ("smollm3", "SmolLM3ForCausalLM"),
693
+ ("solar_open", "SolarOpenForCausalLM"),
683
694
  ("stablelm", "StableLmForCausalLM"),
684
695
  ("starcoder2", "Starcoder2ForCausalLM"),
685
696
  ("trocr", "TrOCRForCausalLM"),
@@ -692,6 +703,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
692
703
  ("xlnet", "XLNetLMHeadModel"),
693
704
  ("xlstm", "xLSTMForCausalLM"),
694
705
  ("xmod", "XmodForCausalLM"),
706
+ ("youtu", "YoutuForCausalLM"),
695
707
  ("zamba", "ZambaForCausalLM"),
696
708
  ("zamba2", "Zamba2ForCausalLM"),
697
709
  ]
@@ -871,6 +883,7 @@ MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = OrderedDict(
871
883
  # Model for Universal Segmentation mapping
872
884
  ("detr", "DetrForSegmentation"),
873
885
  ("eomt", "EomtForUniversalSegmentation"),
886
+ ("eomt_dinov3", "EomtDinov3ForUniversalSegmentation"),
874
887
  ("mask2former", "Mask2FormerForUniversalSegmentation"),
875
888
  ("maskformer", "MaskFormerForInstanceSegmentation"),
876
889
  ("oneformer", "OneFormerForUniversalSegmentation"),
@@ -914,6 +927,7 @@ MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
914
927
  ("glm46v", "Glm46VForConditionalGeneration"),
915
928
  ("glm4v", "Glm4vForConditionalGeneration"),
916
929
  ("glm4v_moe", "Glm4vMoeForConditionalGeneration"),
930
+ ("glm_ocr", "GlmOcrForConditionalGeneration"),
917
931
  ("got_ocr2", "GotOcr2ForConditionalGeneration"),
918
932
  ("idefics", "IdeficsForVisionText2Text"),
919
933
  ("idefics2", "Idefics2ForConditionalGeneration"),
@@ -1028,6 +1042,7 @@ MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict(
1028
1042
  ("deformable_detr", "DeformableDetrForObjectDetection"),
1029
1043
  ("detr", "DetrForObjectDetection"),
1030
1044
  ("lw_detr", "LwDetrForObjectDetection"),
1045
+ ("pp_doclayout_v3", "PPDocLayoutV3ForObjectDetection"),
1031
1046
  ("rt_detr", "RTDetrForObjectDetection"),
1032
1047
  ("rt_detr_v2", "RTDetrV2ForObjectDetection"),
1033
1048
  ("table-transformer", "TableTransformerForObjectDetection"),