transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1021)
  1. transformers/__init__.py +4 -11
  2. transformers/activations.py +2 -2
  3. transformers/backbone_utils.py +326 -0
  4. transformers/cache_utils.py +11 -2
  5. transformers/cli/serve.py +11 -8
  6. transformers/configuration_utils.py +1 -69
  7. transformers/conversion_mapping.py +146 -26
  8. transformers/convert_slow_tokenizer.py +6 -4
  9. transformers/core_model_loading.py +207 -118
  10. transformers/dependency_versions_check.py +0 -1
  11. transformers/dependency_versions_table.py +7 -8
  12. transformers/file_utils.py +0 -2
  13. transformers/generation/candidate_generator.py +1 -2
  14. transformers/generation/continuous_batching/cache.py +40 -38
  15. transformers/generation/continuous_batching/cache_manager.py +3 -16
  16. transformers/generation/continuous_batching/continuous_api.py +94 -406
  17. transformers/generation/continuous_batching/input_ouputs.py +464 -0
  18. transformers/generation/continuous_batching/requests.py +54 -17
  19. transformers/generation/continuous_batching/scheduler.py +77 -95
  20. transformers/generation/logits_process.py +10 -5
  21. transformers/generation/stopping_criteria.py +1 -2
  22. transformers/generation/utils.py +75 -95
  23. transformers/image_processing_utils.py +0 -3
  24. transformers/image_processing_utils_fast.py +17 -18
  25. transformers/image_transforms.py +44 -13
  26. transformers/image_utils.py +0 -5
  27. transformers/initialization.py +57 -0
  28. transformers/integrations/__init__.py +10 -24
  29. transformers/integrations/accelerate.py +47 -11
  30. transformers/integrations/deepspeed.py +145 -3
  31. transformers/integrations/executorch.py +2 -6
  32. transformers/integrations/finegrained_fp8.py +142 -7
  33. transformers/integrations/flash_attention.py +2 -7
  34. transformers/integrations/hub_kernels.py +18 -7
  35. transformers/integrations/moe.py +226 -106
  36. transformers/integrations/mxfp4.py +47 -34
  37. transformers/integrations/peft.py +488 -176
  38. transformers/integrations/tensor_parallel.py +641 -581
  39. transformers/masking_utils.py +153 -9
  40. transformers/modeling_flash_attention_utils.py +1 -2
  41. transformers/modeling_utils.py +359 -358
  42. transformers/models/__init__.py +6 -0
  43. transformers/models/afmoe/configuration_afmoe.py +14 -4
  44. transformers/models/afmoe/modeling_afmoe.py +8 -8
  45. transformers/models/afmoe/modular_afmoe.py +7 -7
  46. transformers/models/aimv2/configuration_aimv2.py +2 -7
  47. transformers/models/aimv2/modeling_aimv2.py +26 -24
  48. transformers/models/aimv2/modular_aimv2.py +8 -12
  49. transformers/models/albert/configuration_albert.py +8 -1
  50. transformers/models/albert/modeling_albert.py +3 -3
  51. transformers/models/align/configuration_align.py +8 -5
  52. transformers/models/align/modeling_align.py +22 -24
  53. transformers/models/altclip/configuration_altclip.py +4 -6
  54. transformers/models/altclip/modeling_altclip.py +30 -26
  55. transformers/models/apertus/configuration_apertus.py +5 -7
  56. transformers/models/apertus/modeling_apertus.py +4 -4
  57. transformers/models/apertus/modular_apertus.py +8 -10
  58. transformers/models/arcee/configuration_arcee.py +5 -7
  59. transformers/models/arcee/modeling_arcee.py +4 -4
  60. transformers/models/aria/configuration_aria.py +11 -21
  61. transformers/models/aria/modeling_aria.py +39 -36
  62. transformers/models/aria/modular_aria.py +33 -39
  63. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
  64. transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
  65. transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
  66. transformers/models/auto/auto_factory.py +8 -6
  67. transformers/models/auto/configuration_auto.py +22 -0
  68. transformers/models/auto/image_processing_auto.py +17 -13
  69. transformers/models/auto/modeling_auto.py +15 -0
  70. transformers/models/auto/processing_auto.py +9 -18
  71. transformers/models/auto/tokenization_auto.py +17 -15
  72. transformers/models/autoformer/modeling_autoformer.py +2 -1
  73. transformers/models/aya_vision/configuration_aya_vision.py +4 -0
  74. transformers/models/aya_vision/modeling_aya_vision.py +29 -62
  75. transformers/models/aya_vision/modular_aya_vision.py +20 -45
  76. transformers/models/bamba/configuration_bamba.py +17 -7
  77. transformers/models/bamba/modeling_bamba.py +23 -55
  78. transformers/models/bamba/modular_bamba.py +19 -54
  79. transformers/models/bark/configuration_bark.py +2 -1
  80. transformers/models/bark/modeling_bark.py +24 -10
  81. transformers/models/bart/configuration_bart.py +9 -4
  82. transformers/models/bart/modeling_bart.py +9 -12
  83. transformers/models/beit/configuration_beit.py +2 -4
  84. transformers/models/beit/image_processing_beit_fast.py +3 -3
  85. transformers/models/beit/modeling_beit.py +14 -9
  86. transformers/models/bert/configuration_bert.py +12 -1
  87. transformers/models/bert/modeling_bert.py +6 -30
  88. transformers/models/bert_generation/configuration_bert_generation.py +17 -1
  89. transformers/models/bert_generation/modeling_bert_generation.py +6 -6
  90. transformers/models/big_bird/configuration_big_bird.py +12 -8
  91. transformers/models/big_bird/modeling_big_bird.py +0 -15
  92. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
  93. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
  94. transformers/models/biogpt/configuration_biogpt.py +8 -1
  95. transformers/models/biogpt/modeling_biogpt.py +4 -8
  96. transformers/models/biogpt/modular_biogpt.py +1 -5
  97. transformers/models/bit/configuration_bit.py +2 -4
  98. transformers/models/bit/modeling_bit.py +6 -5
  99. transformers/models/bitnet/configuration_bitnet.py +5 -7
  100. transformers/models/bitnet/modeling_bitnet.py +3 -4
  101. transformers/models/bitnet/modular_bitnet.py +3 -4
  102. transformers/models/blenderbot/configuration_blenderbot.py +8 -4
  103. transformers/models/blenderbot/modeling_blenderbot.py +4 -4
  104. transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
  105. transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
  106. transformers/models/blip/configuration_blip.py +9 -9
  107. transformers/models/blip/modeling_blip.py +55 -37
  108. transformers/models/blip_2/configuration_blip_2.py +2 -1
  109. transformers/models/blip_2/modeling_blip_2.py +81 -56
  110. transformers/models/bloom/configuration_bloom.py +5 -1
  111. transformers/models/bloom/modeling_bloom.py +2 -1
  112. transformers/models/blt/configuration_blt.py +23 -12
  113. transformers/models/blt/modeling_blt.py +20 -14
  114. transformers/models/blt/modular_blt.py +70 -10
  115. transformers/models/bridgetower/configuration_bridgetower.py +7 -1
  116. transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
  117. transformers/models/bridgetower/modeling_bridgetower.py +29 -15
  118. transformers/models/bros/configuration_bros.py +24 -17
  119. transformers/models/camembert/configuration_camembert.py +8 -1
  120. transformers/models/camembert/modeling_camembert.py +6 -6
  121. transformers/models/canine/configuration_canine.py +4 -1
  122. transformers/models/chameleon/configuration_chameleon.py +5 -7
  123. transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
  124. transformers/models/chameleon/modeling_chameleon.py +82 -36
  125. transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
  126. transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
  127. transformers/models/clap/configuration_clap.py +4 -8
  128. transformers/models/clap/modeling_clap.py +21 -22
  129. transformers/models/clip/configuration_clip.py +4 -1
  130. transformers/models/clip/image_processing_clip_fast.py +9 -0
  131. transformers/models/clip/modeling_clip.py +25 -22
  132. transformers/models/clipseg/configuration_clipseg.py +4 -1
  133. transformers/models/clipseg/modeling_clipseg.py +27 -25
  134. transformers/models/clipseg/processing_clipseg.py +11 -3
  135. transformers/models/clvp/configuration_clvp.py +14 -2
  136. transformers/models/clvp/modeling_clvp.py +19 -30
  137. transformers/models/codegen/configuration_codegen.py +4 -3
  138. transformers/models/codegen/modeling_codegen.py +2 -1
  139. transformers/models/cohere/configuration_cohere.py +5 -7
  140. transformers/models/cohere/modeling_cohere.py +4 -4
  141. transformers/models/cohere/modular_cohere.py +3 -3
  142. transformers/models/cohere2/configuration_cohere2.py +6 -8
  143. transformers/models/cohere2/modeling_cohere2.py +4 -4
  144. transformers/models/cohere2/modular_cohere2.py +9 -11
  145. transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
  146. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
  147. transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
  148. transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
  149. transformers/models/colqwen2/modeling_colqwen2.py +7 -6
  150. transformers/models/colqwen2/modular_colqwen2.py +7 -6
  151. transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
  152. transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
  153. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
  154. transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
  155. transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
  156. transformers/models/convbert/configuration_convbert.py +11 -7
  157. transformers/models/convnext/configuration_convnext.py +2 -4
  158. transformers/models/convnext/image_processing_convnext_fast.py +2 -2
  159. transformers/models/convnext/modeling_convnext.py +7 -6
  160. transformers/models/convnextv2/configuration_convnextv2.py +2 -4
  161. transformers/models/convnextv2/modeling_convnextv2.py +7 -6
  162. transformers/models/cpmant/configuration_cpmant.py +4 -0
  163. transformers/models/csm/configuration_csm.py +9 -15
  164. transformers/models/csm/modeling_csm.py +3 -3
  165. transformers/models/ctrl/configuration_ctrl.py +16 -0
  166. transformers/models/ctrl/modeling_ctrl.py +13 -25
  167. transformers/models/cwm/configuration_cwm.py +5 -7
  168. transformers/models/cwm/modeling_cwm.py +4 -4
  169. transformers/models/d_fine/configuration_d_fine.py +10 -56
  170. transformers/models/d_fine/modeling_d_fine.py +728 -868
  171. transformers/models/d_fine/modular_d_fine.py +335 -412
  172. transformers/models/dab_detr/configuration_dab_detr.py +22 -48
  173. transformers/models/dab_detr/modeling_dab_detr.py +11 -7
  174. transformers/models/dac/modeling_dac.py +1 -1
  175. transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
  176. transformers/models/data2vec/configuration_data2vec_text.py +11 -2
  177. transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
  178. transformers/models/data2vec/modeling_data2vec_text.py +6 -6
  179. transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
  180. transformers/models/dbrx/configuration_dbrx.py +11 -3
  181. transformers/models/dbrx/modeling_dbrx.py +6 -6
  182. transformers/models/dbrx/modular_dbrx.py +6 -6
  183. transformers/models/deberta/configuration_deberta.py +6 -0
  184. transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
  185. transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
  186. transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
  187. transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
  188. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
  189. transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
  190. transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
  191. transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
  192. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
  193. transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
  194. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
  195. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
  196. transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
  197. transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
  198. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
  199. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
  200. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
  201. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
  202. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
  203. transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
  204. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
  205. transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
  206. transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
  207. transformers/models/deit/modeling_deit.py +11 -7
  208. transformers/models/depth_anything/configuration_depth_anything.py +12 -42
  209. transformers/models/depth_anything/modeling_depth_anything.py +5 -3
  210. transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
  211. transformers/models/depth_pro/modeling_depth_pro.py +8 -4
  212. transformers/models/detr/configuration_detr.py +18 -49
  213. transformers/models/detr/image_processing_detr_fast.py +11 -11
  214. transformers/models/detr/modeling_detr.py +695 -734
  215. transformers/models/dia/configuration_dia.py +4 -7
  216. transformers/models/dia/generation_dia.py +8 -17
  217. transformers/models/dia/modeling_dia.py +7 -7
  218. transformers/models/dia/modular_dia.py +4 -4
  219. transformers/models/diffllama/configuration_diffllama.py +5 -7
  220. transformers/models/diffllama/modeling_diffllama.py +3 -8
  221. transformers/models/diffllama/modular_diffllama.py +2 -7
  222. transformers/models/dinat/configuration_dinat.py +2 -4
  223. transformers/models/dinat/modeling_dinat.py +7 -6
  224. transformers/models/dinov2/configuration_dinov2.py +2 -4
  225. transformers/models/dinov2/modeling_dinov2.py +9 -8
  226. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
  227. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
  228. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
  229. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
  230. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
  231. transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
  232. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
  233. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
  234. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
  235. transformers/models/distilbert/configuration_distilbert.py +8 -1
  236. transformers/models/distilbert/modeling_distilbert.py +3 -3
  237. transformers/models/doge/configuration_doge.py +17 -7
  238. transformers/models/doge/modeling_doge.py +4 -4
  239. transformers/models/doge/modular_doge.py +20 -10
  240. transformers/models/donut/image_processing_donut_fast.py +4 -4
  241. transformers/models/dots1/configuration_dots1.py +16 -7
  242. transformers/models/dots1/modeling_dots1.py +4 -4
  243. transformers/models/dpr/configuration_dpr.py +19 -1
  244. transformers/models/dpt/configuration_dpt.py +23 -65
  245. transformers/models/dpt/image_processing_dpt_fast.py +5 -5
  246. transformers/models/dpt/modeling_dpt.py +19 -15
  247. transformers/models/dpt/modular_dpt.py +4 -4
  248. transformers/models/edgetam/configuration_edgetam.py +1 -1
  249. transformers/models/edgetam/modeling_edgetam.py +53 -53
  250. transformers/models/edgetam/modular_edgetam.py +5 -7
  251. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
  252. transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
  253. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
  254. transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
  255. transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
  256. transformers/models/electra/configuration_electra.py +13 -2
  257. transformers/models/electra/modeling_electra.py +6 -6
  258. transformers/models/emu3/configuration_emu3.py +12 -10
  259. transformers/models/emu3/modeling_emu3.py +84 -47
  260. transformers/models/emu3/modular_emu3.py +77 -39
  261. transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
  262. transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
  263. transformers/models/eomt/configuration_eomt.py +12 -13
  264. transformers/models/eomt/image_processing_eomt_fast.py +3 -3
  265. transformers/models/eomt/modeling_eomt.py +3 -3
  266. transformers/models/eomt/modular_eomt.py +17 -17
  267. transformers/models/eomt_dinov3/__init__.py +28 -0
  268. transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
  269. transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
  270. transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
  271. transformers/models/ernie/configuration_ernie.py +24 -2
  272. transformers/models/ernie/modeling_ernie.py +6 -30
  273. transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
  274. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  275. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
  276. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
  277. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
  278. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
  279. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
  280. transformers/models/esm/configuration_esm.py +9 -11
  281. transformers/models/esm/modeling_esm.py +3 -3
  282. transformers/models/esm/modeling_esmfold.py +1 -6
  283. transformers/models/esm/openfold_utils/protein.py +2 -3
  284. transformers/models/evolla/configuration_evolla.py +21 -8
  285. transformers/models/evolla/modeling_evolla.py +11 -7
  286. transformers/models/evolla/modular_evolla.py +5 -1
  287. transformers/models/exaone4/configuration_exaone4.py +8 -5
  288. transformers/models/exaone4/modeling_exaone4.py +4 -4
  289. transformers/models/exaone4/modular_exaone4.py +11 -8
  290. transformers/models/exaone_moe/__init__.py +27 -0
  291. transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
  292. transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
  293. transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
  294. transformers/models/falcon/configuration_falcon.py +9 -1
  295. transformers/models/falcon/modeling_falcon.py +3 -8
  296. transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
  297. transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
  298. transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
  299. transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
  300. transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
  301. transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
  302. transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
  303. transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
  304. transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
  305. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
  306. transformers/models/flaubert/configuration_flaubert.py +10 -4
  307. transformers/models/flaubert/modeling_flaubert.py +1 -1
  308. transformers/models/flava/configuration_flava.py +4 -3
  309. transformers/models/flava/image_processing_flava_fast.py +4 -4
  310. transformers/models/flava/modeling_flava.py +36 -28
  311. transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
  312. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
  313. transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
  314. transformers/models/florence2/configuration_florence2.py +4 -0
  315. transformers/models/florence2/modeling_florence2.py +57 -32
  316. transformers/models/florence2/modular_florence2.py +48 -26
  317. transformers/models/fnet/configuration_fnet.py +6 -1
  318. transformers/models/focalnet/configuration_focalnet.py +2 -4
  319. transformers/models/focalnet/modeling_focalnet.py +10 -7
  320. transformers/models/fsmt/configuration_fsmt.py +12 -16
  321. transformers/models/funnel/configuration_funnel.py +8 -0
  322. transformers/models/fuyu/configuration_fuyu.py +5 -8
  323. transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
  324. transformers/models/fuyu/modeling_fuyu.py +24 -23
  325. transformers/models/gemma/configuration_gemma.py +5 -7
  326. transformers/models/gemma/modeling_gemma.py +4 -4
  327. transformers/models/gemma/modular_gemma.py +5 -7
  328. transformers/models/gemma2/configuration_gemma2.py +5 -7
  329. transformers/models/gemma2/modeling_gemma2.py +4 -4
  330. transformers/models/gemma2/modular_gemma2.py +8 -10
  331. transformers/models/gemma3/configuration_gemma3.py +28 -22
  332. transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
  333. transformers/models/gemma3/modeling_gemma3.py +37 -33
  334. transformers/models/gemma3/modular_gemma3.py +46 -42
  335. transformers/models/gemma3n/configuration_gemma3n.py +35 -22
  336. transformers/models/gemma3n/modeling_gemma3n.py +86 -58
  337. transformers/models/gemma3n/modular_gemma3n.py +112 -75
  338. transformers/models/git/configuration_git.py +5 -7
  339. transformers/models/git/modeling_git.py +31 -41
  340. transformers/models/glm/configuration_glm.py +7 -9
  341. transformers/models/glm/modeling_glm.py +4 -4
  342. transformers/models/glm4/configuration_glm4.py +7 -9
  343. transformers/models/glm4/modeling_glm4.py +4 -4
  344. transformers/models/glm46v/configuration_glm46v.py +4 -0
  345. transformers/models/glm46v/image_processing_glm46v.py +5 -2
  346. transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
  347. transformers/models/glm46v/modeling_glm46v.py +91 -46
  348. transformers/models/glm46v/modular_glm46v.py +4 -0
  349. transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
  350. transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
  351. transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
  352. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
  353. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
  354. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
  355. transformers/models/glm4v/configuration_glm4v.py +12 -8
  356. transformers/models/glm4v/image_processing_glm4v.py +5 -2
  357. transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
  358. transformers/models/glm4v/modeling_glm4v.py +120 -63
  359. transformers/models/glm4v/modular_glm4v.py +82 -50
  360. transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
  361. transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
  362. transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
  363. transformers/models/glm_image/configuration_glm_image.py +26 -20
  364. transformers/models/glm_image/image_processing_glm_image.py +1 -1
  365. transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
  366. transformers/models/glm_image/modeling_glm_image.py +337 -236
  367. transformers/models/glm_image/modular_glm_image.py +415 -255
  368. transformers/models/glm_image/processing_glm_image.py +65 -17
  369. transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
  370. transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
  371. transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
  372. transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
  373. transformers/models/glmasr/modeling_glmasr.py +34 -28
  374. transformers/models/glmasr/modular_glmasr.py +23 -11
  375. transformers/models/glpn/image_processing_glpn_fast.py +3 -3
  376. transformers/models/glpn/modeling_glpn.py +4 -2
  377. transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
  378. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
  379. transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
  380. transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
  381. transformers/models/gpt2/configuration_gpt2.py +13 -1
  382. transformers/models/gpt2/modeling_gpt2.py +5 -5
  383. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
  384. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
  385. transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
  386. transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
  387. transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
  388. transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
  389. transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
  390. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
  391. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
  392. transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
  393. transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
  394. transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
  395. transformers/models/gptj/configuration_gptj.py +4 -4
  396. transformers/models/gptj/modeling_gptj.py +3 -7
  397. transformers/models/granite/configuration_granite.py +5 -7
  398. transformers/models/granite/modeling_granite.py +4 -4
  399. transformers/models/granite_speech/modeling_granite_speech.py +63 -37
  400. transformers/models/granitemoe/configuration_granitemoe.py +5 -7
  401. transformers/models/granitemoe/modeling_granitemoe.py +4 -4
  402. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
  403. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
  404. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
  405. transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
  406. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
  407. transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
  408. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
  409. transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
  410. transformers/models/groupvit/configuration_groupvit.py +4 -1
  411. transformers/models/groupvit/modeling_groupvit.py +29 -22
  412. transformers/models/helium/configuration_helium.py +5 -7
  413. transformers/models/helium/modeling_helium.py +4 -4
  414. transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
  415. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
  416. transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
  417. transformers/models/hiera/configuration_hiera.py +2 -4
  418. transformers/models/hiera/modeling_hiera.py +11 -8
  419. transformers/models/hubert/configuration_hubert.py +4 -1
  420. transformers/models/hubert/modeling_hubert.py +7 -4
  421. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
  422. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
  423. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
  424. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
  425. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
  426. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
  427. transformers/models/ibert/configuration_ibert.py +4 -1
  428. transformers/models/idefics/configuration_idefics.py +5 -7
  429. transformers/models/idefics/modeling_idefics.py +3 -4
  430. transformers/models/idefics/vision.py +5 -4
  431. transformers/models/idefics2/configuration_idefics2.py +1 -2
  432. transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
  433. transformers/models/idefics2/modeling_idefics2.py +72 -50
  434. transformers/models/idefics3/configuration_idefics3.py +1 -3
  435. transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
  436. transformers/models/idefics3/modeling_idefics3.py +63 -40
  437. transformers/models/ijepa/modeling_ijepa.py +3 -3
  438. transformers/models/imagegpt/configuration_imagegpt.py +9 -1
  439. transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
  440. transformers/models/imagegpt/modeling_imagegpt.py +8 -4
  441. transformers/models/informer/modeling_informer.py +3 -3
  442. transformers/models/instructblip/configuration_instructblip.py +2 -1
  443. transformers/models/instructblip/modeling_instructblip.py +65 -39
  444. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
  445. transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
  446. transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
  447. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
  448. transformers/models/internvl/configuration_internvl.py +5 -0
  449. transformers/models/internvl/modeling_internvl.py +35 -55
  450. transformers/models/internvl/modular_internvl.py +26 -38
  451. transformers/models/internvl/video_processing_internvl.py +2 -2
  452. transformers/models/jais2/configuration_jais2.py +5 -7
  453. transformers/models/jais2/modeling_jais2.py +4 -4
  454. transformers/models/jamba/configuration_jamba.py +5 -7
  455. transformers/models/jamba/modeling_jamba.py +4 -4
  456. transformers/models/jamba/modular_jamba.py +3 -3
  457. transformers/models/janus/image_processing_janus.py +2 -2
  458. transformers/models/janus/image_processing_janus_fast.py +8 -8
  459. transformers/models/janus/modeling_janus.py +63 -146
  460. transformers/models/janus/modular_janus.py +62 -20
  461. transformers/models/jetmoe/configuration_jetmoe.py +6 -4
  462. transformers/models/jetmoe/modeling_jetmoe.py +3 -3
  463. transformers/models/jetmoe/modular_jetmoe.py +3 -3
  464. transformers/models/kosmos2/configuration_kosmos2.py +10 -8
  465. transformers/models/kosmos2/modeling_kosmos2.py +56 -34
  466. transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
  467. transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
  468. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
  469. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
  470. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
  471. transformers/models/lasr/configuration_lasr.py +2 -4
  472. transformers/models/lasr/modeling_lasr.py +3 -3
  473. transformers/models/lasr/modular_lasr.py +3 -3
  474. transformers/models/layoutlm/configuration_layoutlm.py +14 -1
  475. transformers/models/layoutlm/modeling_layoutlm.py +3 -3
  476. transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
  477. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
  478. transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
  479. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
  480. transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
  481. transformers/models/led/configuration_led.py +7 -8
  482. transformers/models/levit/image_processing_levit_fast.py +4 -4
  483. transformers/models/lfm2/configuration_lfm2.py +5 -7
  484. transformers/models/lfm2/modeling_lfm2.py +4 -4
  485. transformers/models/lfm2/modular_lfm2.py +3 -3
  486. transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
  487. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
  488. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  489. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
  490. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
  491. transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
  492. transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
  493. transformers/models/lightglue/modeling_lightglue.py +3 -3
  494. transformers/models/lightglue/modular_lightglue.py +3 -3
  495. transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
  496. transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
  497. transformers/models/lilt/configuration_lilt.py +6 -1
  498. transformers/models/llama/configuration_llama.py +5 -7
  499. transformers/models/llama/modeling_llama.py +4 -4
  500. transformers/models/llama4/configuration_llama4.py +67 -47
  501. transformers/models/llama4/image_processing_llama4_fast.py +3 -3
  502. transformers/models/llama4/modeling_llama4.py +46 -44
  503. transformers/models/llava/configuration_llava.py +10 -0
  504. transformers/models/llava/image_processing_llava_fast.py +3 -3
  505. transformers/models/llava/modeling_llava.py +38 -65
  506. transformers/models/llava_next/configuration_llava_next.py +2 -1
  507. transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
  508. transformers/models/llava_next/modeling_llava_next.py +61 -60
  509. transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
  510. transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
  511. transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
  512. transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
  513. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
  514. transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
  515. transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
  516. transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
  517. transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
  518. transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
  519. transformers/models/longformer/configuration_longformer.py +4 -1
  520. transformers/models/longt5/configuration_longt5.py +9 -6
  521. transformers/models/longt5/modeling_longt5.py +2 -1
  522. transformers/models/luke/configuration_luke.py +8 -1
  523. transformers/models/lw_detr/configuration_lw_detr.py +19 -31
  524. transformers/models/lw_detr/modeling_lw_detr.py +43 -44
  525. transformers/models/lw_detr/modular_lw_detr.py +36 -38
  526. transformers/models/lxmert/configuration_lxmert.py +16 -0
  527. transformers/models/m2m_100/configuration_m2m_100.py +7 -8
  528. transformers/models/m2m_100/modeling_m2m_100.py +3 -3
  529. transformers/models/mamba/configuration_mamba.py +5 -2
  530. transformers/models/mamba/modeling_mamba.py +18 -26
  531. transformers/models/mamba2/configuration_mamba2.py +5 -7
  532. transformers/models/mamba2/modeling_mamba2.py +22 -33
  533. transformers/models/marian/configuration_marian.py +10 -4
  534. transformers/models/marian/modeling_marian.py +4 -4
  535. transformers/models/markuplm/configuration_markuplm.py +4 -6
  536. transformers/models/markuplm/modeling_markuplm.py +3 -3
  537. transformers/models/mask2former/configuration_mask2former.py +12 -47
  538. transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
  539. transformers/models/mask2former/modeling_mask2former.py +18 -12
  540. transformers/models/maskformer/configuration_maskformer.py +14 -45
  541. transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
  542. transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
  543. transformers/models/maskformer/modeling_maskformer.py +15 -9
  544. transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
  545. transformers/models/mbart/configuration_mbart.py +9 -4
  546. transformers/models/mbart/modeling_mbart.py +9 -6
  547. transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
  548. transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
  549. transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
  550. transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
  551. transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
  552. transformers/models/mgp_str/modeling_mgp_str.py +4 -2
  553. transformers/models/mimi/configuration_mimi.py +4 -0
  554. transformers/models/mimi/modeling_mimi.py +40 -36
  555. transformers/models/minimax/configuration_minimax.py +8 -11
  556. transformers/models/minimax/modeling_minimax.py +5 -5
  557. transformers/models/minimax/modular_minimax.py +9 -12
  558. transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
  559. transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
  560. transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
  561. transformers/models/ministral/configuration_ministral.py +5 -7
  562. transformers/models/ministral/modeling_ministral.py +4 -4
  563. transformers/models/ministral/modular_ministral.py +5 -8
  564. transformers/models/ministral3/configuration_ministral3.py +4 -4
  565. transformers/models/ministral3/modeling_ministral3.py +4 -4
  566. transformers/models/ministral3/modular_ministral3.py +3 -3
  567. transformers/models/mistral/configuration_mistral.py +5 -7
  568. transformers/models/mistral/modeling_mistral.py +4 -4
  569. transformers/models/mistral/modular_mistral.py +3 -3
  570. transformers/models/mistral3/configuration_mistral3.py +4 -0
  571. transformers/models/mistral3/modeling_mistral3.py +36 -40
  572. transformers/models/mistral3/modular_mistral3.py +31 -32
  573. transformers/models/mixtral/configuration_mixtral.py +8 -11
  574. transformers/models/mixtral/modeling_mixtral.py +4 -4
  575. transformers/models/mlcd/modeling_mlcd.py +7 -5
  576. transformers/models/mlcd/modular_mlcd.py +7 -5
  577. transformers/models/mllama/configuration_mllama.py +5 -7
  578. transformers/models/mllama/image_processing_mllama_fast.py +6 -5
  579. transformers/models/mllama/modeling_mllama.py +19 -19
  580. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
  581. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
  582. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
  583. transformers/models/mobilebert/configuration_mobilebert.py +4 -1
  584. transformers/models/mobilebert/modeling_mobilebert.py +3 -3
  585. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
  586. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
  587. transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
  588. transformers/models/mobilevit/modeling_mobilevit.py +4 -2
  589. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
  590. transformers/models/modernbert/configuration_modernbert.py +46 -21
  591. transformers/models/modernbert/modeling_modernbert.py +146 -899
  592. transformers/models/modernbert/modular_modernbert.py +185 -908
  593. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
  594. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
  595. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
  596. transformers/models/moonshine/configuration_moonshine.py +12 -7
  597. transformers/models/moonshine/modeling_moonshine.py +7 -7
  598. transformers/models/moonshine/modular_moonshine.py +19 -13
  599. transformers/models/moshi/configuration_moshi.py +28 -2
  600. transformers/models/moshi/modeling_moshi.py +4 -9
  601. transformers/models/mpnet/configuration_mpnet.py +6 -1
  602. transformers/models/mpt/configuration_mpt.py +16 -0
  603. transformers/models/mra/configuration_mra.py +8 -1
  604. transformers/models/mt5/configuration_mt5.py +9 -5
  605. transformers/models/mt5/modeling_mt5.py +5 -8
  606. transformers/models/musicgen/configuration_musicgen.py +12 -7
  607. transformers/models/musicgen/modeling_musicgen.py +6 -5
  608. transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
  609. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
  610. transformers/models/mvp/configuration_mvp.py +8 -4
  611. transformers/models/mvp/modeling_mvp.py +6 -4
  612. transformers/models/nanochat/configuration_nanochat.py +5 -7
  613. transformers/models/nanochat/modeling_nanochat.py +4 -4
  614. transformers/models/nanochat/modular_nanochat.py +4 -4
  615. transformers/models/nemotron/configuration_nemotron.py +5 -7
  616. transformers/models/nemotron/modeling_nemotron.py +4 -14
  617. transformers/models/nllb/tokenization_nllb.py +7 -5
  618. transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
  619. transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
  620. transformers/models/nougat/image_processing_nougat_fast.py +8 -8
  621. transformers/models/nystromformer/configuration_nystromformer.py +8 -1
  622. transformers/models/olmo/configuration_olmo.py +5 -7
  623. transformers/models/olmo/modeling_olmo.py +4 -4
  624. transformers/models/olmo/modular_olmo.py +3 -3
  625. transformers/models/olmo2/configuration_olmo2.py +9 -11
  626. transformers/models/olmo2/modeling_olmo2.py +4 -4
  627. transformers/models/olmo2/modular_olmo2.py +7 -7
  628. transformers/models/olmo3/configuration_olmo3.py +10 -11
  629. transformers/models/olmo3/modeling_olmo3.py +4 -4
  630. transformers/models/olmo3/modular_olmo3.py +13 -14
  631. transformers/models/olmoe/configuration_olmoe.py +5 -7
  632. transformers/models/olmoe/modeling_olmoe.py +4 -4
  633. transformers/models/olmoe/modular_olmoe.py +3 -3
  634. transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
  635. transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
  636. transformers/models/oneformer/configuration_oneformer.py +9 -46
  637. transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
  638. transformers/models/oneformer/modeling_oneformer.py +14 -9
  639. transformers/models/openai/configuration_openai.py +16 -0
  640. transformers/models/opt/configuration_opt.py +6 -6
  641. transformers/models/opt/modeling_opt.py +5 -5
  642. transformers/models/ovis2/configuration_ovis2.py +4 -0
  643. transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
  644. transformers/models/ovis2/modeling_ovis2.py +58 -99
  645. transformers/models/ovis2/modular_ovis2.py +52 -13
  646. transformers/models/owlv2/configuration_owlv2.py +4 -1
  647. transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
  648. transformers/models/owlv2/modeling_owlv2.py +40 -27
  649. transformers/models/owlv2/modular_owlv2.py +5 -5
  650. transformers/models/owlvit/configuration_owlvit.py +4 -1
  651. transformers/models/owlvit/modeling_owlvit.py +40 -27
  652. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
  653. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
  654. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
  655. transformers/models/paligemma/configuration_paligemma.py +4 -0
  656. transformers/models/paligemma/modeling_paligemma.py +30 -26
  657. transformers/models/parakeet/configuration_parakeet.py +2 -4
  658. transformers/models/parakeet/modeling_parakeet.py +3 -3
  659. transformers/models/parakeet/modular_parakeet.py +3 -3
  660. transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
  661. transformers/models/patchtst/modeling_patchtst.py +3 -3
  662. transformers/models/pe_audio/modeling_pe_audio.py +4 -4
  663. transformers/models/pe_audio/modular_pe_audio.py +1 -1
  664. transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
  665. transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
  666. transformers/models/pe_video/modeling_pe_video.py +36 -24
  667. transformers/models/pe_video/modular_pe_video.py +36 -23
  668. transformers/models/pegasus/configuration_pegasus.py +8 -5
  669. transformers/models/pegasus/modeling_pegasus.py +4 -4
  670. transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
  671. transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
  672. transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
  673. transformers/models/perceiver/modeling_perceiver.py +17 -9
  674. transformers/models/perception_lm/modeling_perception_lm.py +26 -27
  675. transformers/models/perception_lm/modular_perception_lm.py +27 -25
  676. transformers/models/persimmon/configuration_persimmon.py +5 -7
  677. transformers/models/persimmon/modeling_persimmon.py +5 -5
  678. transformers/models/phi/configuration_phi.py +8 -6
  679. transformers/models/phi/modeling_phi.py +4 -4
  680. transformers/models/phi/modular_phi.py +3 -3
  681. transformers/models/phi3/configuration_phi3.py +9 -11
  682. transformers/models/phi3/modeling_phi3.py +4 -4
  683. transformers/models/phi3/modular_phi3.py +3 -3
  684. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
  685. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
  686. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
  687. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
  688. transformers/models/phimoe/configuration_phimoe.py +5 -7
  689. transformers/models/phimoe/modeling_phimoe.py +15 -39
  690. transformers/models/phimoe/modular_phimoe.py +12 -7
  691. transformers/models/pix2struct/configuration_pix2struct.py +12 -9
  692. transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
  693. transformers/models/pix2struct/modeling_pix2struct.py +14 -7
  694. transformers/models/pixio/configuration_pixio.py +2 -4
  695. transformers/models/pixio/modeling_pixio.py +9 -8
  696. transformers/models/pixio/modular_pixio.py +4 -2
  697. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
  698. transformers/models/pixtral/modeling_pixtral.py +9 -12
  699. transformers/models/plbart/configuration_plbart.py +8 -5
  700. transformers/models/plbart/modeling_plbart.py +9 -7
  701. transformers/models/plbart/modular_plbart.py +1 -1
  702. transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
  703. transformers/models/pop2piano/configuration_pop2piano.py +7 -6
  704. transformers/models/pop2piano/modeling_pop2piano.py +2 -1
  705. transformers/models/pp_doclayout_v3/__init__.py +30 -0
  706. transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
  707. transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
  708. transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
  709. transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
  710. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
  711. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
  712. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
  713. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
  714. transformers/models/prophetnet/configuration_prophetnet.py +11 -10
  715. transformers/models/prophetnet/modeling_prophetnet.py +12 -23
  716. transformers/models/pvt/image_processing_pvt.py +7 -7
  717. transformers/models/pvt/image_processing_pvt_fast.py +1 -1
  718. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  719. transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
  720. transformers/models/qwen2/configuration_qwen2.py +14 -4
  721. transformers/models/qwen2/modeling_qwen2.py +4 -4
  722. transformers/models/qwen2/modular_qwen2.py +3 -3
  723. transformers/models/qwen2/tokenization_qwen2.py +0 -4
  724. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
  725. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
  726. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
  727. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
  728. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
  729. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
  730. transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
  731. transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
  732. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  733. transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
  734. transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
  735. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
  736. transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
  737. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
  738. transformers/models/qwen3/configuration_qwen3.py +15 -5
  739. transformers/models/qwen3/modeling_qwen3.py +4 -4
  740. transformers/models/qwen3/modular_qwen3.py +3 -3
  741. transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
  742. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  743. transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
  744. transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
  745. transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
  746. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
  747. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
  748. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
  749. transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
  750. transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
  751. transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
  752. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
  753. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
  754. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
  755. transformers/models/rag/configuration_rag.py +6 -6
  756. transformers/models/rag/modeling_rag.py +3 -3
  757. transformers/models/rag/retrieval_rag.py +1 -1
  758. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
  759. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
  760. transformers/models/reformer/configuration_reformer.py +7 -7
  761. transformers/models/rembert/configuration_rembert.py +8 -1
  762. transformers/models/rembert/modeling_rembert.py +0 -22
  763. transformers/models/resnet/configuration_resnet.py +2 -4
  764. transformers/models/resnet/modeling_resnet.py +6 -5
  765. transformers/models/roberta/configuration_roberta.py +11 -2
  766. transformers/models/roberta/modeling_roberta.py +6 -6
  767. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
  768. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
  769. transformers/models/roc_bert/configuration_roc_bert.py +8 -1
  770. transformers/models/roc_bert/modeling_roc_bert.py +6 -41
  771. transformers/models/roformer/configuration_roformer.py +13 -2
  772. transformers/models/roformer/modeling_roformer.py +0 -14
  773. transformers/models/rt_detr/configuration_rt_detr.py +8 -49
  774. transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
  775. transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
  776. transformers/models/rt_detr/modeling_rt_detr.py +578 -737
  777. transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
  778. transformers/models/rt_detr/modular_rt_detr.py +1508 -6
  779. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
  780. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
  781. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
  782. transformers/models/rwkv/configuration_rwkv.py +2 -3
  783. transformers/models/rwkv/modeling_rwkv.py +0 -23
  784. transformers/models/sam/configuration_sam.py +2 -0
  785. transformers/models/sam/image_processing_sam_fast.py +4 -4
  786. transformers/models/sam/modeling_sam.py +13 -8
  787. transformers/models/sam/processing_sam.py +3 -3
  788. transformers/models/sam2/configuration_sam2.py +1 -1
  789. transformers/models/sam2/modeling_sam2.py +56 -52
  790. transformers/models/sam2/modular_sam2.py +47 -55
  791. transformers/models/sam2_video/modeling_sam2_video.py +50 -51
  792. transformers/models/sam2_video/modular_sam2_video.py +12 -10
  793. transformers/models/sam3/modeling_sam3.py +43 -47
  794. transformers/models/sam3/processing_sam3.py +8 -4
  795. transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
  796. transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
  797. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
  798. transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
  799. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
  800. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
  801. transformers/models/sam3_video/modeling_sam3_video.py +27 -14
  802. transformers/models/sam_hq/configuration_sam_hq.py +2 -0
  803. transformers/models/sam_hq/modeling_sam_hq.py +13 -9
  804. transformers/models/sam_hq/modular_sam_hq.py +6 -6
  805. transformers/models/sam_hq/processing_sam_hq.py +7 -6
  806. transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
  807. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
  808. transformers/models/seed_oss/configuration_seed_oss.py +7 -9
  809. transformers/models/seed_oss/modeling_seed_oss.py +4 -4
  810. transformers/models/seed_oss/modular_seed_oss.py +3 -3
  811. transformers/models/segformer/image_processing_segformer_fast.py +4 -4
  812. transformers/models/segformer/modeling_segformer.py +4 -2
  813. transformers/models/segformer/modular_segformer.py +3 -3
  814. transformers/models/seggpt/modeling_seggpt.py +20 -8
  815. transformers/models/sew/configuration_sew.py +4 -1
  816. transformers/models/sew/modeling_sew.py +9 -5
  817. transformers/models/sew/modular_sew.py +2 -1
  818. transformers/models/sew_d/configuration_sew_d.py +4 -1
  819. transformers/models/sew_d/modeling_sew_d.py +4 -1
  820. transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
  821. transformers/models/siglip/configuration_siglip.py +4 -1
  822. transformers/models/siglip/modeling_siglip.py +27 -71
  823. transformers/models/siglip2/__init__.py +1 -0
  824. transformers/models/siglip2/configuration_siglip2.py +4 -2
  825. transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
  826. transformers/models/siglip2/modeling_siglip2.py +37 -78
  827. transformers/models/siglip2/modular_siglip2.py +74 -25
  828. transformers/models/siglip2/tokenization_siglip2.py +95 -0
  829. transformers/models/smollm3/configuration_smollm3.py +6 -6
  830. transformers/models/smollm3/modeling_smollm3.py +4 -4
  831. transformers/models/smollm3/modular_smollm3.py +9 -9
  832. transformers/models/smolvlm/configuration_smolvlm.py +1 -3
  833. transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
  834. transformers/models/smolvlm/modeling_smolvlm.py +75 -46
  835. transformers/models/smolvlm/modular_smolvlm.py +36 -23
  836. transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
  837. transformers/models/solar_open/__init__.py +27 -0
  838. transformers/models/solar_open/configuration_solar_open.py +184 -0
  839. transformers/models/solar_open/modeling_solar_open.py +642 -0
  840. transformers/models/solar_open/modular_solar_open.py +224 -0
  841. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
  842. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
  843. transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
  844. transformers/models/speecht5/configuration_speecht5.py +7 -8
  845. transformers/models/splinter/configuration_splinter.py +6 -6
  846. transformers/models/splinter/modeling_splinter.py +8 -3
  847. transformers/models/squeezebert/configuration_squeezebert.py +14 -1
  848. transformers/models/stablelm/configuration_stablelm.py +8 -6
  849. transformers/models/stablelm/modeling_stablelm.py +5 -5
  850. transformers/models/starcoder2/configuration_starcoder2.py +11 -5
  851. transformers/models/starcoder2/modeling_starcoder2.py +5 -5
  852. transformers/models/starcoder2/modular_starcoder2.py +4 -4
  853. transformers/models/superglue/configuration_superglue.py +4 -0
  854. transformers/models/superglue/image_processing_superglue_fast.py +4 -3
  855. transformers/models/superglue/modeling_superglue.py +9 -4
  856. transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
  857. transformers/models/superpoint/modeling_superpoint.py +4 -2
  858. transformers/models/swin/configuration_swin.py +2 -4
  859. transformers/models/swin/modeling_swin.py +11 -8
  860. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
  861. transformers/models/swin2sr/modeling_swin2sr.py +4 -2
  862. transformers/models/swinv2/configuration_swinv2.py +2 -4
  863. transformers/models/swinv2/modeling_swinv2.py +10 -7
  864. transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
  865. transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
  866. transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
  867. transformers/models/t5/configuration_t5.py +9 -8
  868. transformers/models/t5/modeling_t5.py +5 -8
  869. transformers/models/t5gemma/configuration_t5gemma.py +10 -25
  870. transformers/models/t5gemma/modeling_t5gemma.py +9 -9
  871. transformers/models/t5gemma/modular_t5gemma.py +11 -24
  872. transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
  873. transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
  874. transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
  875. transformers/models/table_transformer/configuration_table_transformer.py +18 -49
  876. transformers/models/table_transformer/modeling_table_transformer.py +27 -53
  877. transformers/models/tapas/configuration_tapas.py +12 -1
  878. transformers/models/tapas/modeling_tapas.py +1 -1
  879. transformers/models/tapas/tokenization_tapas.py +1 -0
  880. transformers/models/textnet/configuration_textnet.py +4 -6
  881. transformers/models/textnet/image_processing_textnet_fast.py +3 -3
  882. transformers/models/textnet/modeling_textnet.py +15 -14
  883. transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
  884. transformers/models/timesfm/modeling_timesfm.py +5 -6
  885. transformers/models/timesfm/modular_timesfm.py +5 -6
  886. transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
  887. transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
  888. transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
  889. transformers/models/trocr/configuration_trocr.py +11 -7
  890. transformers/models/trocr/modeling_trocr.py +4 -2
  891. transformers/models/tvp/configuration_tvp.py +10 -35
  892. transformers/models/tvp/image_processing_tvp_fast.py +6 -5
  893. transformers/models/tvp/modeling_tvp.py +1 -1
  894. transformers/models/udop/configuration_udop.py +16 -7
  895. transformers/models/udop/modeling_udop.py +10 -6
  896. transformers/models/umt5/configuration_umt5.py +8 -6
  897. transformers/models/umt5/modeling_umt5.py +7 -3
  898. transformers/models/unispeech/configuration_unispeech.py +4 -1
  899. transformers/models/unispeech/modeling_unispeech.py +7 -4
  900. transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
  901. transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
  902. transformers/models/upernet/configuration_upernet.py +8 -35
  903. transformers/models/upernet/modeling_upernet.py +1 -1
  904. transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
  905. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  906. transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
  907. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
  908. transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
  909. transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
  910. transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
  911. transformers/models/video_llava/configuration_video_llava.py +4 -0
  912. transformers/models/video_llava/modeling_video_llava.py +87 -89
  913. transformers/models/videomae/modeling_videomae.py +4 -5
  914. transformers/models/vilt/configuration_vilt.py +4 -1
  915. transformers/models/vilt/image_processing_vilt_fast.py +6 -6
  916. transformers/models/vilt/modeling_vilt.py +27 -12
  917. transformers/models/vipllava/configuration_vipllava.py +4 -0
  918. transformers/models/vipllava/modeling_vipllava.py +57 -31
  919. transformers/models/vipllava/modular_vipllava.py +50 -24
  920. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
  921. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
  922. transformers/models/visual_bert/configuration_visual_bert.py +6 -1
  923. transformers/models/vit/configuration_vit.py +2 -2
  924. transformers/models/vit/modeling_vit.py +7 -5
  925. transformers/models/vit_mae/modeling_vit_mae.py +11 -7
  926. transformers/models/vit_msn/modeling_vit_msn.py +11 -7
  927. transformers/models/vitdet/configuration_vitdet.py +2 -4
  928. transformers/models/vitdet/modeling_vitdet.py +2 -3
  929. transformers/models/vitmatte/configuration_vitmatte.py +6 -35
  930. transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
  931. transformers/models/vitmatte/modeling_vitmatte.py +1 -1
  932. transformers/models/vitpose/configuration_vitpose.py +6 -43
  933. transformers/models/vitpose/modeling_vitpose.py +5 -3
  934. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
  935. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
  936. transformers/models/vits/configuration_vits.py +4 -0
  937. transformers/models/vits/modeling_vits.py +9 -7
  938. transformers/models/vivit/modeling_vivit.py +4 -4
  939. transformers/models/vjepa2/modeling_vjepa2.py +9 -9
  940. transformers/models/voxtral/configuration_voxtral.py +0 -1
  941. transformers/models/voxtral/modeling_voxtral.py +25 -24
  942. transformers/models/voxtral/modular_voxtral.py +26 -20
  943. transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
  944. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
  945. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
  946. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
  947. transformers/models/wavlm/configuration_wavlm.py +4 -1
  948. transformers/models/wavlm/modeling_wavlm.py +4 -1
  949. transformers/models/whisper/configuration_whisper.py +6 -4
  950. transformers/models/whisper/generation_whisper.py +0 -1
  951. transformers/models/whisper/modeling_whisper.py +3 -3
  952. transformers/models/x_clip/configuration_x_clip.py +4 -1
  953. transformers/models/x_clip/modeling_x_clip.py +26 -27
  954. transformers/models/xglm/configuration_xglm.py +9 -7
  955. transformers/models/xlm/configuration_xlm.py +10 -7
  956. transformers/models/xlm/modeling_xlm.py +1 -1
  957. transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
  958. transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
  959. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
  960. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
  961. transformers/models/xlnet/configuration_xlnet.py +3 -1
  962. transformers/models/xlstm/configuration_xlstm.py +5 -7
  963. transformers/models/xlstm/modeling_xlstm.py +0 -32
  964. transformers/models/xmod/configuration_xmod.py +11 -2
  965. transformers/models/xmod/modeling_xmod.py +13 -16
  966. transformers/models/yolos/image_processing_yolos_fast.py +25 -28
  967. transformers/models/yolos/modeling_yolos.py +7 -7
  968. transformers/models/yolos/modular_yolos.py +16 -16
  969. transformers/models/yoso/configuration_yoso.py +8 -1
  970. transformers/models/youtu/__init__.py +27 -0
  971. transformers/models/youtu/configuration_youtu.py +194 -0
  972. transformers/models/youtu/modeling_youtu.py +619 -0
  973. transformers/models/youtu/modular_youtu.py +254 -0
  974. transformers/models/zamba/configuration_zamba.py +5 -7
  975. transformers/models/zamba/modeling_zamba.py +25 -56
  976. transformers/models/zamba2/configuration_zamba2.py +8 -13
  977. transformers/models/zamba2/modeling_zamba2.py +53 -78
  978. transformers/models/zamba2/modular_zamba2.py +36 -29
  979. transformers/models/zoedepth/configuration_zoedepth.py +17 -40
  980. transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
  981. transformers/models/zoedepth/modeling_zoedepth.py +5 -3
  982. transformers/pipelines/__init__.py +1 -61
  983. transformers/pipelines/any_to_any.py +1 -1
  984. transformers/pipelines/automatic_speech_recognition.py +0 -2
  985. transformers/pipelines/base.py +1 -1
  986. transformers/pipelines/image_text_to_text.py +1 -1
  987. transformers/pipelines/text_to_audio.py +5 -1
  988. transformers/processing_utils.py +35 -44
  989. transformers/pytorch_utils.py +2 -26
  990. transformers/quantizers/quantizer_compressed_tensors.py +7 -5
  991. transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
  992. transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
  993. transformers/quantizers/quantizer_mxfp4.py +1 -1
  994. transformers/quantizers/quantizer_torchao.py +0 -16
  995. transformers/safetensors_conversion.py +11 -4
  996. transformers/testing_utils.py +3 -28
  997. transformers/tokenization_mistral_common.py +9 -0
  998. transformers/tokenization_python.py +6 -4
  999. transformers/tokenization_utils_base.py +119 -219
  1000. transformers/tokenization_utils_tokenizers.py +31 -2
  1001. transformers/trainer.py +25 -33
  1002. transformers/trainer_seq2seq.py +1 -1
  1003. transformers/training_args.py +411 -417
  1004. transformers/utils/__init__.py +1 -4
  1005. transformers/utils/auto_docstring.py +15 -18
  1006. transformers/utils/backbone_utils.py +13 -373
  1007. transformers/utils/doc.py +4 -36
  1008. transformers/utils/generic.py +69 -33
  1009. transformers/utils/import_utils.py +72 -75
  1010. transformers/utils/loading_report.py +133 -105
  1011. transformers/utils/quantization_config.py +0 -21
  1012. transformers/video_processing_utils.py +5 -5
  1013. transformers/video_utils.py +3 -1
  1014. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
  1015. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
  1016. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
  1017. transformers/pipelines/deprecated/text2text_generation.py +0 -408
  1018. transformers/pipelines/image_to_text.py +0 -189
  1019. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
  1020. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
  1021. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -130,6 +130,9 @@ class T5GemmaModuleConfig(Gemma2Config):
130
130
  scaling factor when applying tanh softcapping on the logits.
131
131
  attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
132
132
  scaling factor when applying tanh softcapping on the attention scores.
133
+ is_decoder (`bool`, *optional*, defaults to `False`):
134
+ Whether to only use the decoder in an encoder-decoder architecture, otherwise it has no effect on
135
+ decoder-only or encoder-only architectures.
133
136
 
134
137
  ```python
135
138
  >>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig
@@ -167,8 +170,10 @@ class T5GemmaModuleConfig(Gemma2Config):
167
170
  layer_types: list[str] | None = None,
168
171
  final_logit_softcapping: float | None = 30.0,
169
172
  attn_logit_softcapping: float | None = 50.0,
173
+ is_decoder: bool | None = False,
170
174
  **kwargs,
171
175
  ):
176
+ self.is_decoder = is_decoder
172
177
  super().__init__(
173
178
  vocab_size=vocab_size,
174
179
  hidden_size=hidden_size,
@@ -286,31 +291,13 @@ class T5GemmaConfig(PreTrainedConfig):
286
291
  super().__init__(**kwargs)
287
292
 
288
293
  self.is_encoder_decoder = is_encoder_decoder
289
- self.use_cache = kwargs.get("use_cache", decoder.use_cache)
290
294
  self.initializer_range = kwargs.get("initializer_range", decoder.initializer_range)
291
- self.dropout_rate = dropout_rate
292
- self.attention_dropout = attention_dropout
293
295
  self.classifier_dropout_rate = classifier_dropout_rate
294
296
  self.tie_word_embeddings = tie_word_embeddings
295
297
 
296
298
  # Used in pipeline generation.
297
299
  self.vocab_size = vocab_size
298
300
 
299
- def __setattr__(self, key, value):
300
- shared_attr_with_submodules = [
301
- "output_hidden_states",
302
- "output_attentions",
303
- "_attn_implementation",
304
- "dropout_rate",
305
- "attention_dropout",
306
- "vocab_size",
307
- ]
308
-
309
- if key in shared_attr_with_submodules:
310
- setattr(self.encoder, key, value)
311
- setattr(self.decoder, key, value)
312
- super().__setattr__(key, value)
313
-
314
301
 
315
302
  class T5GemmaRMSNorm(Gemma2RMSNorm):
316
303
  pass
@@ -388,9 +375,9 @@ class T5GemmaCrossAttention(Gemma2Attention):
388
375
  key_states = curr_past_key_values.layers[self.layer_idx].keys
389
376
  value_states = curr_past_key_values.layers[self.layer_idx].values
390
377
 
391
- attention_interface: Callable = eager_attention_forward
392
- if self.config._attn_implementation != "eager":
393
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
378
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
379
+ self.config._attn_implementation, eager_attention_forward
380
+ )
394
381
 
395
382
  attn_output, attn_weights = attention_interface(
396
383
  self,
@@ -696,7 +683,7 @@ class T5GemmaEncoder(T5GemmaPreTrainedModel):
696
683
  position_ids: torch.LongTensor | None = None,
697
684
  inputs_embeds: torch.FloatTensor | None = None,
698
685
  **kwargs: Unpack[TransformersKwargs],
699
- ) -> BaseModelOutput:
686
+ ) -> tuple | BaseModelOutput:
700
687
  if (input_ids is None) ^ (inputs_embeds is not None):
701
688
  raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
702
689
 
@@ -795,7 +782,7 @@ class T5GemmaDecoder(T5GemmaPreTrainedModel):
795
782
  encoder_hidden_states: torch.Tensor | None = None,
796
783
  encoder_attention_mask: torch.Tensor | None = None,
797
784
  **kwargs: Unpack[TransformersKwargs],
798
- ) -> BaseModelOutputWithPastAndCrossAttentions:
785
+ ) -> tuple | BaseModelOutputWithPastAndCrossAttentions:
799
786
  if (input_ids is None) ^ (inputs_embeds is not None):
800
787
  raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
801
788
  if encoder_hidden_states is None:
@@ -997,7 +984,7 @@ class T5GemmaEncoderModel(T5GemmaPreTrainedModel):
997
984
 
998
985
  class T5GemmaForConditionalGeneration(T5GemmaPreTrainedModel, GenerationMixin):
999
986
  _tied_weights_keys = {"lm_head.out_proj.weight": "model.decoder.embed_tokens.weight"}
1000
- _tp_plan = {"lm_head.out_proj": "colwise_rep"}
987
+ _tp_plan = {"lm_head.out_proj": "colwise_gather_output"}
1001
988
  _pp_plan = {"lm_head.out_proj": (["hidden_states"], ["logits"])}
1002
989
 
1003
990
  def __init__(self, config: T5GemmaConfig):
@@ -78,8 +78,6 @@ class T5Gemma2TextConfig(PreTrainedConfig):
78
78
  End of stream token id.
79
79
  bos_token_id (`int`, *optional*, defaults to 2):
80
80
  Beginning of stream token id.
81
- tie_word_embeddings (`bool`, *optional*, defaults to `True`):
82
- Whether to tie weight embeddings
83
81
  attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
84
82
  Whether to use a bias in the query, key, value and output projection layers during self-attention.
85
83
  attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -135,7 +133,6 @@ class T5Gemma2TextConfig(PreTrainedConfig):
135
133
  pad_token_id: int | None = 0,
136
134
  eos_token_id: int | None = 1,
137
135
  bos_token_id: int | None = 2,
138
- tie_word_embeddings: bool | None = True,
139
136
  attention_bias: bool | None = False,
140
137
  attention_dropout: float | None = 0.0,
141
138
  query_pre_attn_scalar: int | None = 256,
@@ -146,6 +143,9 @@ class T5Gemma2TextConfig(PreTrainedConfig):
146
143
  rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
147
144
  **kwargs,
148
145
  ):
146
+ self.pad_token_id = pad_token_id
147
+ self.bos_token_id = bos_token_id
148
+ self.eos_token_id = eos_token_id
149
149
  self.vocab_size = vocab_size
150
150
  self.max_position_embeddings = max_position_embeddings
151
151
  self.hidden_size = hidden_size
@@ -177,13 +177,7 @@ class T5Gemma2TextConfig(PreTrainedConfig):
177
177
  layer_type_validation(self.layer_types, self.num_hidden_layers)
178
178
 
179
179
  self.rope_parameters = rope_parameters
180
- super().__init__(
181
- pad_token_id=pad_token_id,
182
- bos_token_id=bos_token_id,
183
- eos_token_id=eos_token_id,
184
- tie_word_embeddings=tie_word_embeddings,
185
- **kwargs,
186
- )
180
+ super().__init__(**kwargs)
187
181
 
188
182
  def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs):
189
183
  rope_scaling = kwargs.pop("rope_scaling", None)
@@ -197,9 +191,15 @@ class T5Gemma2TextConfig(PreTrainedConfig):
197
191
  self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params
198
192
  if rope_scaling is not None:
199
193
  self.rope_parameters["full_attention"].update(rope_scaling)
194
+
195
+ # Set default values if not present
196
+ if self.rope_parameters.get("full_attention") is None:
197
+ self.rope_parameters["full_attention"] = {"rope_type": "default"}
200
198
  self.rope_parameters["full_attention"].setdefault(
201
199
  "rope_theta", kwargs.pop("rope_theta", self.default_theta["global"])
202
200
  )
201
+ if self.rope_parameters.get("sliding_attention") is None:
202
+ self.rope_parameters["sliding_attention"] = {"rope_type": "default"}
203
203
  self.rope_parameters["sliding_attention"].setdefault(
204
204
  "rope_theta", kwargs.pop("rope_local_base_freq", self.default_theta["local"])
205
205
  )
@@ -236,7 +236,8 @@ class T5Gemma2EncoderConfig(PreTrainedConfig):
236
236
  The image token index to encode the image prompt.
237
237
  initializer_range (`float`, *optional*, defaults to 0.02):
238
238
  The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
239
-
239
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`):
240
+ Whether to tie weight embeddings
240
241
 
241
242
  Example:
242
243
 
@@ -275,11 +276,12 @@ class T5Gemma2EncoderConfig(PreTrainedConfig):
275
276
  self,
276
277
  text_config: T5Gemma2TextConfig | dict[str, Any] | None = None,
277
278
  vision_config: SiglipVisionConfig | dict[str, Any] | None = None,
278
- mm_tokens_per_image: int = 256,
279
- boi_token_index: int = 255_999,
280
- eoi_token_index: int = 256_000,
281
- image_token_index: int = 262_144,
282
- initializer_range: float = 0.02,
279
+ mm_tokens_per_image: int | None = 256,
280
+ boi_token_index: int | None = 255_999,
281
+ eoi_token_index: int | None = 256_000,
282
+ image_token_index: int | None = 262_144,
283
+ initializer_range: float | None = 0.02,
284
+ tie_word_embeddings: bool | None = True,
283
285
  **kwargs,
284
286
  ):
285
287
  if text_config is None:
@@ -301,6 +303,7 @@ class T5Gemma2EncoderConfig(PreTrainedConfig):
301
303
  self.eoi_token_index = eoi_token_index
302
304
  self.image_token_index = image_token_index
303
305
  self.initializer_range = initializer_range
306
+ self.tie_word_embeddings = tie_word_embeddings
304
307
 
305
308
  super().__init__(**kwargs)
306
309
 
@@ -354,8 +357,6 @@ class T5Gemma2DecoderConfig(PreTrainedConfig):
354
357
  End of stream token id.
355
358
  bos_token_id (`int`, *optional*, defaults to 2):
356
359
  Beginning of stream token id.
357
- tie_word_embeddings (`bool`, *optional*, defaults to `True`):
358
- Whether to tie weight embeddings
359
360
  attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
360
361
  Whether to use a bias in the query, key, value and output projection layers during self-attention.
361
362
  attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -411,7 +412,6 @@ class T5Gemma2DecoderConfig(PreTrainedConfig):
411
412
  pad_token_id: int | None = 0,
412
413
  eos_token_id: int | None = 1,
413
414
  bos_token_id: int | None = 2,
414
- tie_word_embeddings: bool | None = True,
415
415
  attention_bias: bool | None = False,
416
416
  attention_dropout: float | None = 0.0,
417
417
  query_pre_attn_scalar: int | None = 256,
@@ -422,6 +422,9 @@ class T5Gemma2DecoderConfig(PreTrainedConfig):
422
422
  rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
423
423
  **kwargs,
424
424
  ):
425
+ self.pad_token_id = pad_token_id
426
+ self.bos_token_id = bos_token_id
427
+ self.eos_token_id = eos_token_id
425
428
  self.vocab_size = vocab_size
426
429
  self.max_position_embeddings = max_position_embeddings
427
430
  self.hidden_size = hidden_size
@@ -453,13 +456,7 @@ class T5Gemma2DecoderConfig(PreTrainedConfig):
453
456
  layer_type_validation(self.layer_types, self.num_hidden_layers)
454
457
 
455
458
  self.rope_parameters = rope_parameters
456
- super().__init__(
457
- pad_token_id=pad_token_id,
458
- bos_token_id=bos_token_id,
459
- eos_token_id=eos_token_id,
460
- tie_word_embeddings=tie_word_embeddings,
461
- **kwargs,
462
- )
459
+ super().__init__(**kwargs)
463
460
 
464
461
  def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs):
465
462
  rope_scaling = kwargs.pop("rope_scaling", None)
@@ -473,9 +470,15 @@ class T5Gemma2DecoderConfig(PreTrainedConfig):
473
470
  self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params
474
471
  if rope_scaling is not None:
475
472
  self.rope_parameters["full_attention"].update(rope_scaling)
473
+
474
+ # Set default values if not present
475
+ if self.rope_parameters.get("full_attention") is None:
476
+ self.rope_parameters["full_attention"] = {"rope_type": "default"}
476
477
  self.rope_parameters["full_attention"].setdefault(
477
478
  "rope_theta", kwargs.pop("rope_theta", self.default_theta["global"])
478
479
  )
480
+ if self.rope_parameters.get("sliding_attention") is None:
481
+ self.rope_parameters["sliding_attention"] = {"rope_type": "default"}
479
482
  self.rope_parameters["sliding_attention"].setdefault(
480
483
  "rope_theta", kwargs.pop("rope_local_base_freq", self.default_theta["local"])
481
484
  )
@@ -513,6 +516,9 @@ class T5Gemma2Config(PreTrainedConfig):
513
516
  image_token_index (`int`, *optional*, defaults to 256001):
514
517
  The image token index to encode the image prompt. Defaults to 256001, which is right after the eoi_token_index.
515
518
  Note this is different from Gemma 3.
519
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`):
520
+ Whether to tie weight embeddings
521
+
516
522
  ```python
517
523
  >>> from transformers import T5Gemma2Config, T5Gemma2Model
518
524
  >>> t5gemma2_config = T5Gemma2Config.from_pretrained("google/t5gemma-270m-270m")
@@ -543,6 +549,7 @@ class T5Gemma2Config(PreTrainedConfig):
543
549
  classifier_dropout_rate: float = 0.0,
544
550
  initializer_range: float = 0.02,
545
551
  image_token_index: int = 256_001,
552
+ tie_word_embeddings: bool | None = True,
546
553
  **kwargs,
547
554
  ):
548
555
  if isinstance(encoder, dict):
@@ -594,33 +601,13 @@ class T5Gemma2Config(PreTrainedConfig):
594
601
  if special_token_key not in kwargs:
595
602
  kwargs[special_token_key] = getattr(decoder, special_token_key)
596
603
 
597
- super().__init__(**kwargs)
598
-
599
- self.is_encoder_decoder = is_encoder_decoder
600
- self.dropout_rate = dropout_rate
601
- self.attention_dropout = attention_dropout
602
604
  self.classifier_dropout_rate = classifier_dropout_rate
603
605
  self.initializer_range = initializer_range
604
606
  self.eoi_token_index = encoder.eoi_token_index
605
607
  self.image_token_index = image_token_index
608
+ self.tie_word_embeddings = tie_word_embeddings
606
609
 
607
- def __setattr__(self, key, value):
608
- shared_attr_with_submodules = [
609
- "output_hidden_states",
610
- "output_attentions",
611
- "_attn_implementation_internal",
612
- "dropout_rate",
613
- "attention_dropout",
614
- "vocab_size",
615
- "dtype",
616
- ]
617
-
618
- if key in shared_attr_with_submodules:
619
- setattr(self.encoder.text_config, key, value)
620
- setattr(self.encoder.vision_config, key, value)
621
- setattr(self.decoder, key, value)
622
- setattr(self.encoder, key, value)
623
- super().__setattr__(key, value)
610
+ super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
624
611
 
625
612
 
626
613
  __all__ = ["T5Gemma2Config", "T5Gemma2TextConfig", "T5Gemma2EncoderConfig", "T5Gemma2DecoderConfig"]
@@ -36,6 +36,7 @@ from ...modeling_layers import GradientCheckpointingLayer
36
36
  from ...modeling_outputs import (
37
37
  BaseModelOutput,
38
38
  BaseModelOutputWithPastAndCrossAttentions,
39
+ BaseModelOutputWithPooling,
39
40
  Seq2SeqLMOutput,
40
41
  Seq2SeqModelOutput,
41
42
  SequenceClassifierOutput,
@@ -44,7 +45,7 @@ from ...modeling_outputs import (
44
45
  from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
45
46
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
46
47
  from ...processing_utils import Unpack
47
- from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
48
+ from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check
48
49
  from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
49
50
  from ..auto import AutoModel
50
51
  from .configuration_t5gemma2 import T5Gemma2Config, T5Gemma2DecoderConfig, T5Gemma2EncoderConfig, T5Gemma2TextConfig
@@ -311,9 +312,9 @@ class T5Gemma2SelfAttention(nn.Module):
311
312
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
312
313
  key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
313
314
 
314
- attention_interface: Callable = eager_attention_forward
315
- if self.config._attn_implementation != "eager":
316
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
315
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
316
+ self.config._attn_implementation, eager_attention_forward
317
+ )
317
318
 
318
319
  attn_output, attn_weights = attention_interface(
319
320
  self,
@@ -431,9 +432,9 @@ class T5Gemma2MergedAttention(nn.Module):
431
432
  key_states = torch.cat([key_states, cross_key_states], dim=2)
432
433
  value_states = torch.cat([value_states, cross_value_states], dim=2)
433
434
 
434
- attention_interface: Callable = eager_attention_forward
435
- if self.config._attn_implementation != "eager":
436
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
435
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
436
+ self.config._attn_implementation, eager_attention_forward
437
+ )
437
438
 
438
439
  attn_output, attn_weights = attention_interface(
439
440
  self,
@@ -615,11 +616,11 @@ class T5Gemma2MultiModalProjector(nn.Module):
615
616
  self.avg_pool = nn.AvgPool2d(kernel_size=self.kernel_size, stride=self.kernel_size)
616
617
 
617
618
  def forward(self, vision_outputs: torch.Tensor):
618
- batch_size, _, seq_length = vision_outputs.shape
619
+ batch_size, _, hidden_size = vision_outputs.shape
619
620
 
620
621
  reshaped_vision_outputs = vision_outputs.transpose(1, 2)
621
622
  reshaped_vision_outputs = reshaped_vision_outputs.reshape(
622
- batch_size, seq_length, self.patches_per_image, self.patches_per_image
623
+ batch_size, hidden_size, self.patches_per_image, self.patches_per_image
623
624
  )
624
625
  reshaped_vision_outputs = reshaped_vision_outputs.contiguous()
625
626
 
@@ -762,8 +763,8 @@ def sliding_window_mask_function(sliding_window: int, is_causal=True) -> Callabl
762
763
  return inner_mask
763
764
 
764
765
 
765
- class T5Gemma2Encoder(T5Gemma2PreTrainedModel):
766
- config: T5Gemma2EncoderConfig
766
+ class T5Gemma2TextEncoder(T5Gemma2PreTrainedModel):
767
+ config: T5Gemma2TextConfig
767
768
  _can_record_outputs = {
768
769
  "attentions": T5Gemma2SelfAttention,
769
770
  "hidden_states": T5Gemma2EncoderLayer,
@@ -771,96 +772,32 @@ class T5Gemma2Encoder(T5Gemma2PreTrainedModel):
771
772
 
772
773
  def __init__(
773
774
  self,
774
- config: T5Gemma2EncoderConfig,
775
+ config: T5Gemma2TextConfig,
775
776
  eoi_token_index: int = 256_000,
776
777
  ):
777
778
  super().__init__(config)
778
779
  self.padding_idx = config.pad_token_id
779
- self.vocab_size = config.text_config.vocab_size
780
-
781
- vision_config = config.vision_config
782
- text_config = config.text_config
783
-
784
- # setup vision tower
785
- self.vision_tower = AutoModel.from_config(config=vision_config)
786
- self.multi_modal_projector = T5Gemma2MultiModalProjector(config)
780
+ self.vocab_size = config.vocab_size
787
781
 
788
782
  self.embed_tokens = T5Gemma2TextScaledWordEmbedding(
789
- text_config.vocab_size,
790
- text_config.hidden_size,
783
+ config.vocab_size,
784
+ config.hidden_size,
791
785
  self.padding_idx,
792
- embed_scale=text_config.hidden_size**0.5,
786
+ embed_scale=config.hidden_size**0.5,
793
787
  eoi_token_index=eoi_token_index,
794
788
  )
795
- self.norm = T5Gemma2RMSNorm(text_config.hidden_size, eps=text_config.rms_norm_eps)
789
+ self.norm = T5Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
796
790
  self.gradient_checkpointing = False
797
791
 
798
792
  self.layers = nn.ModuleList(
799
- [T5Gemma2EncoderLayer(text_config, layer_idx) for layer_idx in range(text_config.num_hidden_layers)]
793
+ [T5Gemma2EncoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
800
794
  )
801
- self.dropout = nn.Dropout(text_config.dropout_rate)
802
- self.rotary_emb = T5Gemma2RotaryEmbedding(text_config)
803
-
804
- self.text_config = text_config
795
+ self.dropout = nn.Dropout(config.dropout_rate)
796
+ self.rotary_emb = T5Gemma2RotaryEmbedding(config)
805
797
 
806
798
  # Initialize weights and apply final processing
807
799
  self.post_init()
808
800
 
809
- def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
810
- """Convert pixel image to image features via the encoder and projector."""
811
- # pixel_values: (batch_size, channels, height, width)
812
- # image_features: Image feature tensor of shape (num_images, image_length, embed_dim).
813
- vision_outputs = self.vision_tower(pixel_values=pixel_values).last_hidden_state
814
- image_features = self.multi_modal_projector(vision_outputs)
815
- return image_features
816
-
817
- def get_image_placeholder_mask(
818
- self,
819
- input_ids: torch.LongTensor | None,
820
- inputs_embeds: torch.FloatTensor | None,
821
- image_features: torch.FloatTensor,
822
- ):
823
- """
824
- Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
825
- equal to the length of multimodal features. If the lengths are different, an error is raised.
826
- """
827
- image_token_id = self.config.image_token_id
828
- if input_ids is None:
829
- if inputs_embeds is None:
830
- raise ValueError("Either `input_ids` or `inputs_embeds` has to be provided.")
831
- special_image_mask = inputs_embeds == self.get_input_embeddings()(
832
- torch.tensor(image_token_id, dtype=torch.long, device=inputs_embeds.device)
833
- )
834
- special_image_mask = special_image_mask.all(-1)
835
- else:
836
- special_image_mask = input_ids == image_token_id
837
-
838
- n_image_tokens = special_image_mask.sum()
839
- special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
840
- n_image_features = image_features.shape[0] * image_features.shape[1]
841
- if inputs_embeds[special_image_mask].numel() != image_features.numel():
842
- raise ValueError(
843
- f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
844
- )
845
- return special_image_mask
846
-
847
- def preprocess_image_features(
848
- self,
849
- pixel_values: torch.Tensor,
850
- input_ids: torch.LongTensor | None = None,
851
- inputs_embeds: torch.FloatTensor | None = None,
852
- ):
853
- """Convert pixel images to image features and merge into input embeds."""
854
- image_features = self.get_image_features(pixel_values)
855
- image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
856
-
857
- image_mask = self.get_image_placeholder_mask(
858
- input_ids, inputs_embeds=inputs_embeds, image_features=image_features
859
- )
860
-
861
- inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_features)
862
- return inputs_embeds
863
-
864
801
  @check_model_inputs
865
802
  @auto_docstring
866
803
  def forward(
@@ -869,12 +806,10 @@ class T5Gemma2Encoder(T5Gemma2PreTrainedModel):
869
806
  attention_mask: torch.Tensor | None = None,
870
807
  position_ids: torch.LongTensor | None = None,
871
808
  inputs_embeds: torch.FloatTensor | None = None,
872
- pixel_values: torch.FloatTensor | None = None,
873
809
  # Unused for processor compatibility kept in signature.
874
810
  token_type_ids: torch.Tensor | None = None,
875
811
  **kwargs: Unpack[TransformersKwargs],
876
812
  ) -> BaseModelOutput:
877
- del token_type_ids
878
813
  if (input_ids is None) ^ (inputs_embeds is not None):
879
814
  raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
880
815
 
@@ -884,11 +819,6 @@ class T5Gemma2Encoder(T5Gemma2PreTrainedModel):
884
819
  if inputs_embeds is None:
885
820
  inputs_embeds = self.embed_tokens(input_ids)
886
821
 
887
- if pixel_values is not None:
888
- inputs_embeds = self.preprocess_image_features(
889
- pixel_values, input_ids=input_ids, inputs_embeds=inputs_embeds
890
- )
891
-
892
822
  if position_ids is None:
893
823
  position_ids = torch.arange(0, inputs_embeds.shape[1], device=inputs_embeds.device).unsqueeze(0)
894
824
 
@@ -902,7 +832,7 @@ class T5Gemma2Encoder(T5Gemma2PreTrainedModel):
902
832
  "full_attention": create_bidirectional_mask(**mask_kwargs),
903
833
  "sliding_attention": create_bidirectional_mask(
904
834
  **mask_kwargs,
905
- and_mask_function=sliding_window_mask_function(self.text_config.sliding_window, is_causal=False),
835
+ and_mask_function=sliding_window_mask_function(self.config.sliding_window, is_causal=False),
906
836
  ),
907
837
  }
908
838
 
@@ -911,13 +841,13 @@ class T5Gemma2Encoder(T5Gemma2PreTrainedModel):
911
841
 
912
842
  # global and local position embeddings
913
843
  position_embeddings = {}
914
- for layer_type in self.text_config.layer_types:
844
+ for layer_type in self.config.layer_types:
915
845
  position_embeddings[layer_type] = self.rotary_emb(hidden_states, position_ids, layer_type)
916
846
 
917
847
  # dropout
918
848
  hidden_states = self.dropout(hidden_states)
919
849
 
920
- for layer_module in self.layers[: self.text_config.num_hidden_layers]:
850
+ for layer_module in self.layers[: self.config.num_hidden_layers]:
921
851
  hidden_states = layer_module(
922
852
  hidden_states,
923
853
  position_embeddings[layer_module.attention_type],
@@ -933,6 +863,114 @@ class T5Gemma2Encoder(T5Gemma2PreTrainedModel):
933
863
  )
934
864
 
935
865
 
866
+ class T5Gemma2Encoder(T5Gemma2PreTrainedModel):
867
+ config: T5Gemma2EncoderConfig
868
+
869
+ def __init__(
870
+ self,
871
+ config: T5Gemma2EncoderConfig,
872
+ eoi_token_index: int = 256_000,
873
+ ):
874
+ super().__init__(config)
875
+
876
+ self.text_model = T5Gemma2TextEncoder._from_config(config.text_config, eoi_token_index=eoi_token_index)
877
+ self.vision_tower = AutoModel.from_config(config=config.vision_config)
878
+ self.multi_modal_projector = T5Gemma2MultiModalProjector(config)
879
+
880
+ # Initialize weights and apply final processing
881
+ self.post_init()
882
+
883
+ def get_input_embeddings(self):
884
+ return self.text_model.get_input_embeddings()
885
+
886
+ def set_input_embeddings(self, new_embeddings):
887
+ return self.text_model.set_input_embeddings(new_embeddings)
888
+
889
+ @can_return_tuple
890
+ @auto_docstring
891
+ def get_image_features(
892
+ self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
893
+ ) -> tuple | BaseModelOutputWithPooling:
894
+ # pixel_values: (batch_size, channels, height, width)
895
+ # image_features: Image feature tensor of shape (num_images, image_length, embed_dim).
896
+ vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs)
897
+ last_hidden_state = vision_outputs.last_hidden_state
898
+ image_features = self.multi_modal_projector(last_hidden_state)
899
+ vision_outputs.pooler_output = image_features
900
+
901
+ return vision_outputs
902
+
903
+ def get_image_placeholder_mask(
904
+ self,
905
+ input_ids: torch.LongTensor | None,
906
+ inputs_embeds: torch.FloatTensor | None,
907
+ image_features: torch.FloatTensor,
908
+ ):
909
+ """
910
+ Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
911
+ equal to the length of multimodal features. If the lengths are different, an error is raised.
912
+ """
913
+ image_token_id = self.config.image_token_id
914
+ if input_ids is None:
915
+ if inputs_embeds is None:
916
+ raise ValueError("Either `input_ids` or `inputs_embeds` has to be provided.")
917
+ special_image_mask = inputs_embeds == self.get_input_embeddings()(
918
+ torch.tensor(image_token_id, dtype=torch.long, device=inputs_embeds.device)
919
+ )
920
+ special_image_mask = special_image_mask.all(-1)
921
+ else:
922
+ special_image_mask = input_ids == image_token_id
923
+
924
+ n_image_tokens = special_image_mask.sum()
925
+ special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
926
+ n_image_features = image_features.shape[0] * image_features.shape[1]
927
+ torch_compilable_check(
928
+ inputs_embeds[special_image_mask].numel() == image_features.numel(),
929
+ f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}",
930
+ )
931
+ return special_image_mask
932
+
933
+ @check_model_inputs
934
+ @auto_docstring
935
+ def forward(
936
+ self,
937
+ input_ids: torch.LongTensor | None = None,
938
+ attention_mask: torch.Tensor | None = None,
939
+ position_ids: torch.LongTensor | None = None,
940
+ inputs_embeds: torch.FloatTensor | None = None,
941
+ pixel_values: torch.FloatTensor | None = None,
942
+ # Unused for processor compatibility kept in signature.
943
+ token_type_ids: torch.Tensor | None = None,
944
+ **kwargs: Unpack[TransformersKwargs],
945
+ ) -> BaseModelOutput:
946
+ if (input_ids is None) ^ (inputs_embeds is not None):
947
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
948
+
949
+ if inputs_embeds is None:
950
+ inputs_embeds = self.text_model.embed_tokens(input_ids)
951
+
952
+ if pixel_values is not None:
953
+ image_features = self.get_image_features(pixel_values, return_dict=True).pooler_output
954
+ image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
955
+
956
+ image_mask = self.get_image_placeholder_mask(
957
+ input_ids, inputs_embeds=inputs_embeds, image_features=image_features
958
+ )
959
+
960
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_features)
961
+
962
+ hidden_states = self.text_model(
963
+ inputs_embeds=inputs_embeds,
964
+ attention_mask=attention_mask,
965
+ position_ids=position_ids,
966
+ **kwargs,
967
+ )
968
+
969
+ return BaseModelOutput(
970
+ last_hidden_state=hidden_states,
971
+ )
972
+
973
+
936
974
  def bidirectional_mask_function(attention_mask: torch.Tensor | None) -> Callable:
937
975
  """
938
976
  This creates bidirectional attention mask.
@@ -1088,8 +1126,8 @@ class T5Gemma2Decoder(T5Gemma2PreTrainedModel):
1088
1126
  @auto_docstring
1089
1127
  class T5Gemma2Model(T5Gemma2PreTrainedModel):
1090
1128
  _tied_weights_keys = {
1091
- "decoder.embed_tokens.weight": "encoder.embed_tokens.weight",
1092
- "decoder.embed_tokens.eoi_embedding": "encoder.embed_tokens.eoi_embedding",
1129
+ "decoder.embed_tokens.weight": "encoder.text_model.embed_tokens.weight",
1130
+ "decoder.embed_tokens.eoi_embedding": "encoder.text_model.embed_tokens.eoi_embedding",
1093
1131
  }
1094
1132
 
1095
1133
  def __init__(self, config: T5Gemma2Config):
@@ -1183,9 +1221,9 @@ class T5Gemma2Model(T5Gemma2PreTrainedModel):
1183
1221
 
1184
1222
  class T5Gemma2ForConditionalGeneration(T5Gemma2PreTrainedModel, GenerationMixin):
1185
1223
  _tied_weights_keys = {
1186
- "lm_head.out_proj.weight": "model.encoder.embed_tokens.weight",
1224
+ "lm_head.out_proj.weight": "model.encoder.text_model.embed_tokens.weight",
1187
1225
  }
1188
- _tp_plan = {"lm_head.out_proj": "colwise_rep"}
1226
+ _tp_plan = {"lm_head.out_proj": "colwise_gather_output"}
1189
1227
  _pp_plan = {"lm_head.out_proj": (["hidden_states"], ["logits"])}
1190
1228
 
1191
1229
  def __init__(self, config: T5Gemma2Config):
@@ -1216,8 +1254,12 @@ class T5Gemma2ForConditionalGeneration(T5Gemma2PreTrainedModel, GenerationMixin)
1216
1254
  def get_decoder(self):
1217
1255
  return self.model.get_decoder()
1218
1256
 
1219
- def get_image_features(self, pixel_values):
1220
- return self.get_encoder().get_image_features(pixel_values)
1257
+ @can_return_tuple
1258
+ @auto_docstring
1259
+ def get_image_features(
1260
+ self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
1261
+ ) -> tuple | BaseModelOutputWithPooling:
1262
+ return self.get_encoder().get_image_features(pixel_values, **kwargs)
1221
1263
 
1222
1264
  @property
1223
1265
  def vision_tower(self):
@@ -1572,6 +1614,7 @@ class T5Gemma2ForTokenClassification(T5Gemma2PreTrainedModel):
1572
1614
  __all__ = [
1573
1615
  "T5Gemma2ForConditionalGeneration",
1574
1616
  "T5Gemma2Model",
1617
+ "T5Gemma2Encoder",
1575
1618
  "T5Gemma2PreTrainedModel",
1576
1619
  "T5Gemma2ForSequenceClassification",
1577
1620
  "T5Gemma2ForTokenClassification",