transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
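The per-file +/- counts in the listing below can be reproduced locally from the two wheels themselves. As a rough illustration (not part of the registry page, and not the registry's actual diff engine), the following Python sketch downloads both wheels from PyPI with pip, reads them as zip archives, and tallies added/removed lines per .py file with difflib. The helper names fetch_wheel and diff_stats are invented for this example.

import difflib
import subprocess
import sys
import tempfile
import zipfile
from pathlib import Path

def fetch_wheel(version: str, dest: Path) -> Path:
    """Download one transformers wheel from PyPI, without dependencies."""
    subprocess.run(
        [sys.executable, "-m", "pip", "download", f"transformers=={version}",
         "--no-deps", "--only-binary", ":all:", "-d", str(dest)],
        check=True,
    )
    return next(dest.glob(f"transformers-{version}-*.whl"))

def diff_stats(old_whl: Path, new_whl: Path) -> dict:
    """Map each changed .py path to its (added, removed) line counts."""
    stats = {}
    with zipfile.ZipFile(old_whl) as old, zipfile.ZipFile(new_whl) as new:
        old_names, new_names = set(old.namelist()), set(new.namelist())
        for name in sorted(old_names | new_names):
            if not name.endswith(".py"):
                continue
            before = old.read(name).decode().splitlines() if name in old_names else []
            after = new.read(name).decode().splitlines() if name in new_names else []
            added = removed = 0
            for line in difflib.unified_diff(before, after, lineterm=""):
                # Skip the "+++"/"---" file headers; count real +/- lines.
                if line.startswith("+") and not line.startswith("+++"):
                    added += 1
                elif line.startswith("-") and not line.startswith("---"):
                    removed += 1
            if added or removed:
                stats[name] = (added, removed)
    return stats

if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as tmp:
        dest = Path(tmp)
        old = fetch_wheel("5.0.0rc3", dest)
        new = fetch_wheel("5.1.0", dest)
        for path, (plus, minus) in sorted(diff_stats(old, new).items()):
            print(f"{path} +{plus} -{minus}")

Counts computed this way track the listing closely but need not match it exactly: a registry's diff engine may normalize line endings, treat renames specially, or include non-Python files.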
Files changed (1021)
  1. transformers/__init__.py +4 -11
  2. transformers/activations.py +2 -2
  3. transformers/backbone_utils.py +326 -0
  4. transformers/cache_utils.py +11 -2
  5. transformers/cli/serve.py +11 -8
  6. transformers/configuration_utils.py +1 -69
  7. transformers/conversion_mapping.py +146 -26
  8. transformers/convert_slow_tokenizer.py +6 -4
  9. transformers/core_model_loading.py +207 -118
  10. transformers/dependency_versions_check.py +0 -1
  11. transformers/dependency_versions_table.py +7 -8
  12. transformers/file_utils.py +0 -2
  13. transformers/generation/candidate_generator.py +1 -2
  14. transformers/generation/continuous_batching/cache.py +40 -38
  15. transformers/generation/continuous_batching/cache_manager.py +3 -16
  16. transformers/generation/continuous_batching/continuous_api.py +94 -406
  17. transformers/generation/continuous_batching/input_ouputs.py +464 -0
  18. transformers/generation/continuous_batching/requests.py +54 -17
  19. transformers/generation/continuous_batching/scheduler.py +77 -95
  20. transformers/generation/logits_process.py +10 -5
  21. transformers/generation/stopping_criteria.py +1 -2
  22. transformers/generation/utils.py +75 -95
  23. transformers/image_processing_utils.py +0 -3
  24. transformers/image_processing_utils_fast.py +17 -18
  25. transformers/image_transforms.py +44 -13
  26. transformers/image_utils.py +0 -5
  27. transformers/initialization.py +57 -0
  28. transformers/integrations/__init__.py +10 -24
  29. transformers/integrations/accelerate.py +47 -11
  30. transformers/integrations/deepspeed.py +145 -3
  31. transformers/integrations/executorch.py +2 -6
  32. transformers/integrations/finegrained_fp8.py +142 -7
  33. transformers/integrations/flash_attention.py +2 -7
  34. transformers/integrations/hub_kernels.py +18 -7
  35. transformers/integrations/moe.py +226 -106
  36. transformers/integrations/mxfp4.py +47 -34
  37. transformers/integrations/peft.py +488 -176
  38. transformers/integrations/tensor_parallel.py +641 -581
  39. transformers/masking_utils.py +153 -9
  40. transformers/modeling_flash_attention_utils.py +1 -2
  41. transformers/modeling_utils.py +359 -358
  42. transformers/models/__init__.py +6 -0
  43. transformers/models/afmoe/configuration_afmoe.py +14 -4
  44. transformers/models/afmoe/modeling_afmoe.py +8 -8
  45. transformers/models/afmoe/modular_afmoe.py +7 -7
  46. transformers/models/aimv2/configuration_aimv2.py +2 -7
  47. transformers/models/aimv2/modeling_aimv2.py +26 -24
  48. transformers/models/aimv2/modular_aimv2.py +8 -12
  49. transformers/models/albert/configuration_albert.py +8 -1
  50. transformers/models/albert/modeling_albert.py +3 -3
  51. transformers/models/align/configuration_align.py +8 -5
  52. transformers/models/align/modeling_align.py +22 -24
  53. transformers/models/altclip/configuration_altclip.py +4 -6
  54. transformers/models/altclip/modeling_altclip.py +30 -26
  55. transformers/models/apertus/configuration_apertus.py +5 -7
  56. transformers/models/apertus/modeling_apertus.py +4 -4
  57. transformers/models/apertus/modular_apertus.py +8 -10
  58. transformers/models/arcee/configuration_arcee.py +5 -7
  59. transformers/models/arcee/modeling_arcee.py +4 -4
  60. transformers/models/aria/configuration_aria.py +11 -21
  61. transformers/models/aria/modeling_aria.py +39 -36
  62. transformers/models/aria/modular_aria.py +33 -39
  63. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
  64. transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
  65. transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
  66. transformers/models/auto/auto_factory.py +8 -6
  67. transformers/models/auto/configuration_auto.py +22 -0
  68. transformers/models/auto/image_processing_auto.py +17 -13
  69. transformers/models/auto/modeling_auto.py +15 -0
  70. transformers/models/auto/processing_auto.py +9 -18
  71. transformers/models/auto/tokenization_auto.py +17 -15
  72. transformers/models/autoformer/modeling_autoformer.py +2 -1
  73. transformers/models/aya_vision/configuration_aya_vision.py +4 -0
  74. transformers/models/aya_vision/modeling_aya_vision.py +29 -62
  75. transformers/models/aya_vision/modular_aya_vision.py +20 -45
  76. transformers/models/bamba/configuration_bamba.py +17 -7
  77. transformers/models/bamba/modeling_bamba.py +23 -55
  78. transformers/models/bamba/modular_bamba.py +19 -54
  79. transformers/models/bark/configuration_bark.py +2 -1
  80. transformers/models/bark/modeling_bark.py +24 -10
  81. transformers/models/bart/configuration_bart.py +9 -4
  82. transformers/models/bart/modeling_bart.py +9 -12
  83. transformers/models/beit/configuration_beit.py +2 -4
  84. transformers/models/beit/image_processing_beit_fast.py +3 -3
  85. transformers/models/beit/modeling_beit.py +14 -9
  86. transformers/models/bert/configuration_bert.py +12 -1
  87. transformers/models/bert/modeling_bert.py +6 -30
  88. transformers/models/bert_generation/configuration_bert_generation.py +17 -1
  89. transformers/models/bert_generation/modeling_bert_generation.py +6 -6
  90. transformers/models/big_bird/configuration_big_bird.py +12 -8
  91. transformers/models/big_bird/modeling_big_bird.py +0 -15
  92. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
  93. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
  94. transformers/models/biogpt/configuration_biogpt.py +8 -1
  95. transformers/models/biogpt/modeling_biogpt.py +4 -8
  96. transformers/models/biogpt/modular_biogpt.py +1 -5
  97. transformers/models/bit/configuration_bit.py +2 -4
  98. transformers/models/bit/modeling_bit.py +6 -5
  99. transformers/models/bitnet/configuration_bitnet.py +5 -7
  100. transformers/models/bitnet/modeling_bitnet.py +3 -4
  101. transformers/models/bitnet/modular_bitnet.py +3 -4
  102. transformers/models/blenderbot/configuration_blenderbot.py +8 -4
  103. transformers/models/blenderbot/modeling_blenderbot.py +4 -4
  104. transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
  105. transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
  106. transformers/models/blip/configuration_blip.py +9 -9
  107. transformers/models/blip/modeling_blip.py +55 -37
  108. transformers/models/blip_2/configuration_blip_2.py +2 -1
  109. transformers/models/blip_2/modeling_blip_2.py +81 -56
  110. transformers/models/bloom/configuration_bloom.py +5 -1
  111. transformers/models/bloom/modeling_bloom.py +2 -1
  112. transformers/models/blt/configuration_blt.py +23 -12
  113. transformers/models/blt/modeling_blt.py +20 -14
  114. transformers/models/blt/modular_blt.py +70 -10
  115. transformers/models/bridgetower/configuration_bridgetower.py +7 -1
  116. transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
  117. transformers/models/bridgetower/modeling_bridgetower.py +29 -15
  118. transformers/models/bros/configuration_bros.py +24 -17
  119. transformers/models/camembert/configuration_camembert.py +8 -1
  120. transformers/models/camembert/modeling_camembert.py +6 -6
  121. transformers/models/canine/configuration_canine.py +4 -1
  122. transformers/models/chameleon/configuration_chameleon.py +5 -7
  123. transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
  124. transformers/models/chameleon/modeling_chameleon.py +82 -36
  125. transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
  126. transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
  127. transformers/models/clap/configuration_clap.py +4 -8
  128. transformers/models/clap/modeling_clap.py +21 -22
  129. transformers/models/clip/configuration_clip.py +4 -1
  130. transformers/models/clip/image_processing_clip_fast.py +9 -0
  131. transformers/models/clip/modeling_clip.py +25 -22
  132. transformers/models/clipseg/configuration_clipseg.py +4 -1
  133. transformers/models/clipseg/modeling_clipseg.py +27 -25
  134. transformers/models/clipseg/processing_clipseg.py +11 -3
  135. transformers/models/clvp/configuration_clvp.py +14 -2
  136. transformers/models/clvp/modeling_clvp.py +19 -30
  137. transformers/models/codegen/configuration_codegen.py +4 -3
  138. transformers/models/codegen/modeling_codegen.py +2 -1
  139. transformers/models/cohere/configuration_cohere.py +5 -7
  140. transformers/models/cohere/modeling_cohere.py +4 -4
  141. transformers/models/cohere/modular_cohere.py +3 -3
  142. transformers/models/cohere2/configuration_cohere2.py +6 -8
  143. transformers/models/cohere2/modeling_cohere2.py +4 -4
  144. transformers/models/cohere2/modular_cohere2.py +9 -11
  145. transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
  146. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
  147. transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
  148. transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
  149. transformers/models/colqwen2/modeling_colqwen2.py +7 -6
  150. transformers/models/colqwen2/modular_colqwen2.py +7 -6
  151. transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
  152. transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
  153. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
  154. transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
  155. transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
  156. transformers/models/convbert/configuration_convbert.py +11 -7
  157. transformers/models/convnext/configuration_convnext.py +2 -4
  158. transformers/models/convnext/image_processing_convnext_fast.py +2 -2
  159. transformers/models/convnext/modeling_convnext.py +7 -6
  160. transformers/models/convnextv2/configuration_convnextv2.py +2 -4
  161. transformers/models/convnextv2/modeling_convnextv2.py +7 -6
  162. transformers/models/cpmant/configuration_cpmant.py +4 -0
  163. transformers/models/csm/configuration_csm.py +9 -15
  164. transformers/models/csm/modeling_csm.py +3 -3
  165. transformers/models/ctrl/configuration_ctrl.py +16 -0
  166. transformers/models/ctrl/modeling_ctrl.py +13 -25
  167. transformers/models/cwm/configuration_cwm.py +5 -7
  168. transformers/models/cwm/modeling_cwm.py +4 -4
  169. transformers/models/d_fine/configuration_d_fine.py +10 -56
  170. transformers/models/d_fine/modeling_d_fine.py +728 -868
  171. transformers/models/d_fine/modular_d_fine.py +335 -412
  172. transformers/models/dab_detr/configuration_dab_detr.py +22 -48
  173. transformers/models/dab_detr/modeling_dab_detr.py +11 -7
  174. transformers/models/dac/modeling_dac.py +1 -1
  175. transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
  176. transformers/models/data2vec/configuration_data2vec_text.py +11 -2
  177. transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
  178. transformers/models/data2vec/modeling_data2vec_text.py +6 -6
  179. transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
  180. transformers/models/dbrx/configuration_dbrx.py +11 -3
  181. transformers/models/dbrx/modeling_dbrx.py +6 -6
  182. transformers/models/dbrx/modular_dbrx.py +6 -6
  183. transformers/models/deberta/configuration_deberta.py +6 -0
  184. transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
  185. transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
  186. transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
  187. transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
  188. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
  189. transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
  190. transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
  191. transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
  192. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
  193. transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
  194. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
  195. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
  196. transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
  197. transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
  198. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
  199. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
  200. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
  201. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
  202. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
  203. transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
  204. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
  205. transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
  206. transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
  207. transformers/models/deit/modeling_deit.py +11 -7
  208. transformers/models/depth_anything/configuration_depth_anything.py +12 -42
  209. transformers/models/depth_anything/modeling_depth_anything.py +5 -3
  210. transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
  211. transformers/models/depth_pro/modeling_depth_pro.py +8 -4
  212. transformers/models/detr/configuration_detr.py +18 -49
  213. transformers/models/detr/image_processing_detr_fast.py +11 -11
  214. transformers/models/detr/modeling_detr.py +695 -734
  215. transformers/models/dia/configuration_dia.py +4 -7
  216. transformers/models/dia/generation_dia.py +8 -17
  217. transformers/models/dia/modeling_dia.py +7 -7
  218. transformers/models/dia/modular_dia.py +4 -4
  219. transformers/models/diffllama/configuration_diffllama.py +5 -7
  220. transformers/models/diffllama/modeling_diffllama.py +3 -8
  221. transformers/models/diffllama/modular_diffllama.py +2 -7
  222. transformers/models/dinat/configuration_dinat.py +2 -4
  223. transformers/models/dinat/modeling_dinat.py +7 -6
  224. transformers/models/dinov2/configuration_dinov2.py +2 -4
  225. transformers/models/dinov2/modeling_dinov2.py +9 -8
  226. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
  227. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
  228. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
  229. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
  230. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
  231. transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
  232. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
  233. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
  234. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
  235. transformers/models/distilbert/configuration_distilbert.py +8 -1
  236. transformers/models/distilbert/modeling_distilbert.py +3 -3
  237. transformers/models/doge/configuration_doge.py +17 -7
  238. transformers/models/doge/modeling_doge.py +4 -4
  239. transformers/models/doge/modular_doge.py +20 -10
  240. transformers/models/donut/image_processing_donut_fast.py +4 -4
  241. transformers/models/dots1/configuration_dots1.py +16 -7
  242. transformers/models/dots1/modeling_dots1.py +4 -4
  243. transformers/models/dpr/configuration_dpr.py +19 -1
  244. transformers/models/dpt/configuration_dpt.py +23 -65
  245. transformers/models/dpt/image_processing_dpt_fast.py +5 -5
  246. transformers/models/dpt/modeling_dpt.py +19 -15
  247. transformers/models/dpt/modular_dpt.py +4 -4
  248. transformers/models/edgetam/configuration_edgetam.py +1 -1
  249. transformers/models/edgetam/modeling_edgetam.py +53 -53
  250. transformers/models/edgetam/modular_edgetam.py +5 -7
  251. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
  252. transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
  253. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
  254. transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
  255. transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
  256. transformers/models/electra/configuration_electra.py +13 -2
  257. transformers/models/electra/modeling_electra.py +6 -6
  258. transformers/models/emu3/configuration_emu3.py +12 -10
  259. transformers/models/emu3/modeling_emu3.py +84 -47
  260. transformers/models/emu3/modular_emu3.py +77 -39
  261. transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
  262. transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
  263. transformers/models/eomt/configuration_eomt.py +12 -13
  264. transformers/models/eomt/image_processing_eomt_fast.py +3 -3
  265. transformers/models/eomt/modeling_eomt.py +3 -3
  266. transformers/models/eomt/modular_eomt.py +17 -17
  267. transformers/models/eomt_dinov3/__init__.py +28 -0
  268. transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
  269. transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
  270. transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
  271. transformers/models/ernie/configuration_ernie.py +24 -2
  272. transformers/models/ernie/modeling_ernie.py +6 -30
  273. transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
  274. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  275. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
  276. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
  277. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
  278. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
  279. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
  280. transformers/models/esm/configuration_esm.py +9 -11
  281. transformers/models/esm/modeling_esm.py +3 -3
  282. transformers/models/esm/modeling_esmfold.py +1 -6
  283. transformers/models/esm/openfold_utils/protein.py +2 -3
  284. transformers/models/evolla/configuration_evolla.py +21 -8
  285. transformers/models/evolla/modeling_evolla.py +11 -7
  286. transformers/models/evolla/modular_evolla.py +5 -1
  287. transformers/models/exaone4/configuration_exaone4.py +8 -5
  288. transformers/models/exaone4/modeling_exaone4.py +4 -4
  289. transformers/models/exaone4/modular_exaone4.py +11 -8
  290. transformers/models/exaone_moe/__init__.py +27 -0
  291. transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
  292. transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
  293. transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
  294. transformers/models/falcon/configuration_falcon.py +9 -1
  295. transformers/models/falcon/modeling_falcon.py +3 -8
  296. transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
  297. transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
  298. transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
  299. transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
  300. transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
  301. transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
  302. transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
  303. transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
  304. transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
  305. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
  306. transformers/models/flaubert/configuration_flaubert.py +10 -4
  307. transformers/models/flaubert/modeling_flaubert.py +1 -1
  308. transformers/models/flava/configuration_flava.py +4 -3
  309. transformers/models/flava/image_processing_flava_fast.py +4 -4
  310. transformers/models/flava/modeling_flava.py +36 -28
  311. transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
  312. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
  313. transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
  314. transformers/models/florence2/configuration_florence2.py +4 -0
  315. transformers/models/florence2/modeling_florence2.py +57 -32
  316. transformers/models/florence2/modular_florence2.py +48 -26
  317. transformers/models/fnet/configuration_fnet.py +6 -1
  318. transformers/models/focalnet/configuration_focalnet.py +2 -4
  319. transformers/models/focalnet/modeling_focalnet.py +10 -7
  320. transformers/models/fsmt/configuration_fsmt.py +12 -16
  321. transformers/models/funnel/configuration_funnel.py +8 -0
  322. transformers/models/fuyu/configuration_fuyu.py +5 -8
  323. transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
  324. transformers/models/fuyu/modeling_fuyu.py +24 -23
  325. transformers/models/gemma/configuration_gemma.py +5 -7
  326. transformers/models/gemma/modeling_gemma.py +4 -4
  327. transformers/models/gemma/modular_gemma.py +5 -7
  328. transformers/models/gemma2/configuration_gemma2.py +5 -7
  329. transformers/models/gemma2/modeling_gemma2.py +4 -4
  330. transformers/models/gemma2/modular_gemma2.py +8 -10
  331. transformers/models/gemma3/configuration_gemma3.py +28 -22
  332. transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
  333. transformers/models/gemma3/modeling_gemma3.py +37 -33
  334. transformers/models/gemma3/modular_gemma3.py +46 -42
  335. transformers/models/gemma3n/configuration_gemma3n.py +35 -22
  336. transformers/models/gemma3n/modeling_gemma3n.py +86 -58
  337. transformers/models/gemma3n/modular_gemma3n.py +112 -75
  338. transformers/models/git/configuration_git.py +5 -7
  339. transformers/models/git/modeling_git.py +31 -41
  340. transformers/models/glm/configuration_glm.py +7 -9
  341. transformers/models/glm/modeling_glm.py +4 -4
  342. transformers/models/glm4/configuration_glm4.py +7 -9
  343. transformers/models/glm4/modeling_glm4.py +4 -4
  344. transformers/models/glm46v/configuration_glm46v.py +4 -0
  345. transformers/models/glm46v/image_processing_glm46v.py +5 -2
  346. transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
  347. transformers/models/glm46v/modeling_glm46v.py +91 -46
  348. transformers/models/glm46v/modular_glm46v.py +4 -0
  349. transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
  350. transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
  351. transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
  352. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
  353. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
  354. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
  355. transformers/models/glm4v/configuration_glm4v.py +12 -8
  356. transformers/models/glm4v/image_processing_glm4v.py +5 -2
  357. transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
  358. transformers/models/glm4v/modeling_glm4v.py +120 -63
  359. transformers/models/glm4v/modular_glm4v.py +82 -50
  360. transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
  361. transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
  362. transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
  363. transformers/models/glm_image/configuration_glm_image.py +26 -20
  364. transformers/models/glm_image/image_processing_glm_image.py +1 -1
  365. transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
  366. transformers/models/glm_image/modeling_glm_image.py +337 -236
  367. transformers/models/glm_image/modular_glm_image.py +415 -255
  368. transformers/models/glm_image/processing_glm_image.py +65 -17
  369. transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
  370. transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
  371. transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
  372. transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
  373. transformers/models/glmasr/modeling_glmasr.py +34 -28
  374. transformers/models/glmasr/modular_glmasr.py +23 -11
  375. transformers/models/glpn/image_processing_glpn_fast.py +3 -3
  376. transformers/models/glpn/modeling_glpn.py +4 -2
  377. transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
  378. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
  379. transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
  380. transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
  381. transformers/models/gpt2/configuration_gpt2.py +13 -1
  382. transformers/models/gpt2/modeling_gpt2.py +5 -5
  383. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
  384. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
  385. transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
  386. transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
  387. transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
  388. transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
  389. transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
  390. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
  391. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
  392. transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
  393. transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
  394. transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
  395. transformers/models/gptj/configuration_gptj.py +4 -4
  396. transformers/models/gptj/modeling_gptj.py +3 -7
  397. transformers/models/granite/configuration_granite.py +5 -7
  398. transformers/models/granite/modeling_granite.py +4 -4
  399. transformers/models/granite_speech/modeling_granite_speech.py +63 -37
  400. transformers/models/granitemoe/configuration_granitemoe.py +5 -7
  401. transformers/models/granitemoe/modeling_granitemoe.py +4 -4
  402. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
  403. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
  404. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
  405. transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
  406. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
  407. transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
  408. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
  409. transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
  410. transformers/models/groupvit/configuration_groupvit.py +4 -1
  411. transformers/models/groupvit/modeling_groupvit.py +29 -22
  412. transformers/models/helium/configuration_helium.py +5 -7
  413. transformers/models/helium/modeling_helium.py +4 -4
  414. transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
  415. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
  416. transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
  417. transformers/models/hiera/configuration_hiera.py +2 -4
  418. transformers/models/hiera/modeling_hiera.py +11 -8
  419. transformers/models/hubert/configuration_hubert.py +4 -1
  420. transformers/models/hubert/modeling_hubert.py +7 -4
  421. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
  422. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
  423. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
  424. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
  425. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
  426. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
  427. transformers/models/ibert/configuration_ibert.py +4 -1
  428. transformers/models/idefics/configuration_idefics.py +5 -7
  429. transformers/models/idefics/modeling_idefics.py +3 -4
  430. transformers/models/idefics/vision.py +5 -4
  431. transformers/models/idefics2/configuration_idefics2.py +1 -2
  432. transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
  433. transformers/models/idefics2/modeling_idefics2.py +72 -50
  434. transformers/models/idefics3/configuration_idefics3.py +1 -3
  435. transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
  436. transformers/models/idefics3/modeling_idefics3.py +63 -40
  437. transformers/models/ijepa/modeling_ijepa.py +3 -3
  438. transformers/models/imagegpt/configuration_imagegpt.py +9 -1
  439. transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
  440. transformers/models/imagegpt/modeling_imagegpt.py +8 -4
  441. transformers/models/informer/modeling_informer.py +3 -3
  442. transformers/models/instructblip/configuration_instructblip.py +2 -1
  443. transformers/models/instructblip/modeling_instructblip.py +65 -39
  444. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
  445. transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
  446. transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
  447. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
  448. transformers/models/internvl/configuration_internvl.py +5 -0
  449. transformers/models/internvl/modeling_internvl.py +35 -55
  450. transformers/models/internvl/modular_internvl.py +26 -38
  451. transformers/models/internvl/video_processing_internvl.py +2 -2
  452. transformers/models/jais2/configuration_jais2.py +5 -7
  453. transformers/models/jais2/modeling_jais2.py +4 -4
  454. transformers/models/jamba/configuration_jamba.py +5 -7
  455. transformers/models/jamba/modeling_jamba.py +4 -4
  456. transformers/models/jamba/modular_jamba.py +3 -3
  457. transformers/models/janus/image_processing_janus.py +2 -2
  458. transformers/models/janus/image_processing_janus_fast.py +8 -8
  459. transformers/models/janus/modeling_janus.py +63 -146
  460. transformers/models/janus/modular_janus.py +62 -20
  461. transformers/models/jetmoe/configuration_jetmoe.py +6 -4
  462. transformers/models/jetmoe/modeling_jetmoe.py +3 -3
  463. transformers/models/jetmoe/modular_jetmoe.py +3 -3
  464. transformers/models/kosmos2/configuration_kosmos2.py +10 -8
  465. transformers/models/kosmos2/modeling_kosmos2.py +56 -34
  466. transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
  467. transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
  468. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
  469. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
  470. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
  471. transformers/models/lasr/configuration_lasr.py +2 -4
  472. transformers/models/lasr/modeling_lasr.py +3 -3
  473. transformers/models/lasr/modular_lasr.py +3 -3
  474. transformers/models/layoutlm/configuration_layoutlm.py +14 -1
  475. transformers/models/layoutlm/modeling_layoutlm.py +3 -3
  476. transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
  477. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
  478. transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
  479. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
  480. transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
  481. transformers/models/led/configuration_led.py +7 -8
  482. transformers/models/levit/image_processing_levit_fast.py +4 -4
  483. transformers/models/lfm2/configuration_lfm2.py +5 -7
  484. transformers/models/lfm2/modeling_lfm2.py +4 -4
  485. transformers/models/lfm2/modular_lfm2.py +3 -3
  486. transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
  487. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
  488. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  489. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
  490. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
  491. transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
  492. transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
  493. transformers/models/lightglue/modeling_lightglue.py +3 -3
  494. transformers/models/lightglue/modular_lightglue.py +3 -3
  495. transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
  496. transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
  497. transformers/models/lilt/configuration_lilt.py +6 -1
  498. transformers/models/llama/configuration_llama.py +5 -7
  499. transformers/models/llama/modeling_llama.py +4 -4
  500. transformers/models/llama4/configuration_llama4.py +67 -47
  501. transformers/models/llama4/image_processing_llama4_fast.py +3 -3
  502. transformers/models/llama4/modeling_llama4.py +46 -44
  503. transformers/models/llava/configuration_llava.py +10 -0
  504. transformers/models/llava/image_processing_llava_fast.py +3 -3
  505. transformers/models/llava/modeling_llava.py +38 -65
  506. transformers/models/llava_next/configuration_llava_next.py +2 -1
  507. transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
  508. transformers/models/llava_next/modeling_llava_next.py +61 -60
  509. transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
  510. transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
  511. transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
  512. transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
  513. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
  514. transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
  515. transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
  516. transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
  517. transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
  518. transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
  519. transformers/models/longformer/configuration_longformer.py +4 -1
  520. transformers/models/longt5/configuration_longt5.py +9 -6
  521. transformers/models/longt5/modeling_longt5.py +2 -1
  522. transformers/models/luke/configuration_luke.py +8 -1
  523. transformers/models/lw_detr/configuration_lw_detr.py +19 -31
  524. transformers/models/lw_detr/modeling_lw_detr.py +43 -44
  525. transformers/models/lw_detr/modular_lw_detr.py +36 -38
  526. transformers/models/lxmert/configuration_lxmert.py +16 -0
  527. transformers/models/m2m_100/configuration_m2m_100.py +7 -8
  528. transformers/models/m2m_100/modeling_m2m_100.py +3 -3
  529. transformers/models/mamba/configuration_mamba.py +5 -2
  530. transformers/models/mamba/modeling_mamba.py +18 -26
  531. transformers/models/mamba2/configuration_mamba2.py +5 -7
  532. transformers/models/mamba2/modeling_mamba2.py +22 -33
  533. transformers/models/marian/configuration_marian.py +10 -4
  534. transformers/models/marian/modeling_marian.py +4 -4
  535. transformers/models/markuplm/configuration_markuplm.py +4 -6
  536. transformers/models/markuplm/modeling_markuplm.py +3 -3
  537. transformers/models/mask2former/configuration_mask2former.py +12 -47
  538. transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
  539. transformers/models/mask2former/modeling_mask2former.py +18 -12
  540. transformers/models/maskformer/configuration_maskformer.py +14 -45
  541. transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
  542. transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
  543. transformers/models/maskformer/modeling_maskformer.py +15 -9
  544. transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
  545. transformers/models/mbart/configuration_mbart.py +9 -4
  546. transformers/models/mbart/modeling_mbart.py +9 -6
  547. transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
  548. transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
  549. transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
  550. transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
  551. transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
  552. transformers/models/mgp_str/modeling_mgp_str.py +4 -2
  553. transformers/models/mimi/configuration_mimi.py +4 -0
  554. transformers/models/mimi/modeling_mimi.py +40 -36
  555. transformers/models/minimax/configuration_minimax.py +8 -11
  556. transformers/models/minimax/modeling_minimax.py +5 -5
  557. transformers/models/minimax/modular_minimax.py +9 -12
  558. transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
  559. transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
  560. transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
  561. transformers/models/ministral/configuration_ministral.py +5 -7
  562. transformers/models/ministral/modeling_ministral.py +4 -4
  563. transformers/models/ministral/modular_ministral.py +5 -8
  564. transformers/models/ministral3/configuration_ministral3.py +4 -4
  565. transformers/models/ministral3/modeling_ministral3.py +4 -4
  566. transformers/models/ministral3/modular_ministral3.py +3 -3
  567. transformers/models/mistral/configuration_mistral.py +5 -7
  568. transformers/models/mistral/modeling_mistral.py +4 -4
  569. transformers/models/mistral/modular_mistral.py +3 -3
  570. transformers/models/mistral3/configuration_mistral3.py +4 -0
  571. transformers/models/mistral3/modeling_mistral3.py +36 -40
  572. transformers/models/mistral3/modular_mistral3.py +31 -32
  573. transformers/models/mixtral/configuration_mixtral.py +8 -11
  574. transformers/models/mixtral/modeling_mixtral.py +4 -4
  575. transformers/models/mlcd/modeling_mlcd.py +7 -5
  576. transformers/models/mlcd/modular_mlcd.py +7 -5
  577. transformers/models/mllama/configuration_mllama.py +5 -7
  578. transformers/models/mllama/image_processing_mllama_fast.py +6 -5
  579. transformers/models/mllama/modeling_mllama.py +19 -19
  580. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
  581. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
  582. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
  583. transformers/models/mobilebert/configuration_mobilebert.py +4 -1
  584. transformers/models/mobilebert/modeling_mobilebert.py +3 -3
  585. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
  586. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
  587. transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
  588. transformers/models/mobilevit/modeling_mobilevit.py +4 -2
  589. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
  590. transformers/models/modernbert/configuration_modernbert.py +46 -21
  591. transformers/models/modernbert/modeling_modernbert.py +146 -899
  592. transformers/models/modernbert/modular_modernbert.py +185 -908
  593. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
  594. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
  595. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
  596. transformers/models/moonshine/configuration_moonshine.py +12 -7
  597. transformers/models/moonshine/modeling_moonshine.py +7 -7
  598. transformers/models/moonshine/modular_moonshine.py +19 -13
  599. transformers/models/moshi/configuration_moshi.py +28 -2
  600. transformers/models/moshi/modeling_moshi.py +4 -9
  601. transformers/models/mpnet/configuration_mpnet.py +6 -1
  602. transformers/models/mpt/configuration_mpt.py +16 -0
  603. transformers/models/mra/configuration_mra.py +8 -1
  604. transformers/models/mt5/configuration_mt5.py +9 -5
  605. transformers/models/mt5/modeling_mt5.py +5 -8
  606. transformers/models/musicgen/configuration_musicgen.py +12 -7
  607. transformers/models/musicgen/modeling_musicgen.py +6 -5
  608. transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
  609. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
  610. transformers/models/mvp/configuration_mvp.py +8 -4
  611. transformers/models/mvp/modeling_mvp.py +6 -4
  612. transformers/models/nanochat/configuration_nanochat.py +5 -7
  613. transformers/models/nanochat/modeling_nanochat.py +4 -4
  614. transformers/models/nanochat/modular_nanochat.py +4 -4
  615. transformers/models/nemotron/configuration_nemotron.py +5 -7
  616. transformers/models/nemotron/modeling_nemotron.py +4 -14
  617. transformers/models/nllb/tokenization_nllb.py +7 -5
  618. transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
  619. transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
  620. transformers/models/nougat/image_processing_nougat_fast.py +8 -8
  621. transformers/models/nystromformer/configuration_nystromformer.py +8 -1
  622. transformers/models/olmo/configuration_olmo.py +5 -7
  623. transformers/models/olmo/modeling_olmo.py +4 -4
  624. transformers/models/olmo/modular_olmo.py +3 -3
  625. transformers/models/olmo2/configuration_olmo2.py +9 -11
  626. transformers/models/olmo2/modeling_olmo2.py +4 -4
  627. transformers/models/olmo2/modular_olmo2.py +7 -7
  628. transformers/models/olmo3/configuration_olmo3.py +10 -11
  629. transformers/models/olmo3/modeling_olmo3.py +4 -4
  630. transformers/models/olmo3/modular_olmo3.py +13 -14
  631. transformers/models/olmoe/configuration_olmoe.py +5 -7
  632. transformers/models/olmoe/modeling_olmoe.py +4 -4
  633. transformers/models/olmoe/modular_olmoe.py +3 -3
  634. transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
  635. transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
  636. transformers/models/oneformer/configuration_oneformer.py +9 -46
  637. transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
  638. transformers/models/oneformer/modeling_oneformer.py +14 -9
  639. transformers/models/openai/configuration_openai.py +16 -0
  640. transformers/models/opt/configuration_opt.py +6 -6
  641. transformers/models/opt/modeling_opt.py +5 -5
  642. transformers/models/ovis2/configuration_ovis2.py +4 -0
  643. transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
  644. transformers/models/ovis2/modeling_ovis2.py +58 -99
  645. transformers/models/ovis2/modular_ovis2.py +52 -13
  646. transformers/models/owlv2/configuration_owlv2.py +4 -1
  647. transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
  648. transformers/models/owlv2/modeling_owlv2.py +40 -27
  649. transformers/models/owlv2/modular_owlv2.py +5 -5
  650. transformers/models/owlvit/configuration_owlvit.py +4 -1
  651. transformers/models/owlvit/modeling_owlvit.py +40 -27
  652. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
  653. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
  654. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
  655. transformers/models/paligemma/configuration_paligemma.py +4 -0
  656. transformers/models/paligemma/modeling_paligemma.py +30 -26
  657. transformers/models/parakeet/configuration_parakeet.py +2 -4
  658. transformers/models/parakeet/modeling_parakeet.py +3 -3
  659. transformers/models/parakeet/modular_parakeet.py +3 -3
  660. transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
  661. transformers/models/patchtst/modeling_patchtst.py +3 -3
  662. transformers/models/pe_audio/modeling_pe_audio.py +4 -4
  663. transformers/models/pe_audio/modular_pe_audio.py +1 -1
  664. transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
  665. transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
  666. transformers/models/pe_video/modeling_pe_video.py +36 -24
  667. transformers/models/pe_video/modular_pe_video.py +36 -23
  668. transformers/models/pegasus/configuration_pegasus.py +8 -5
  669. transformers/models/pegasus/modeling_pegasus.py +4 -4
  670. transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
  671. transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
  672. transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
  673. transformers/models/perceiver/modeling_perceiver.py +17 -9
  674. transformers/models/perception_lm/modeling_perception_lm.py +26 -27
  675. transformers/models/perception_lm/modular_perception_lm.py +27 -25
  676. transformers/models/persimmon/configuration_persimmon.py +5 -7
  677. transformers/models/persimmon/modeling_persimmon.py +5 -5
  678. transformers/models/phi/configuration_phi.py +8 -6
  679. transformers/models/phi/modeling_phi.py +4 -4
  680. transformers/models/phi/modular_phi.py +3 -3
  681. transformers/models/phi3/configuration_phi3.py +9 -11
  682. transformers/models/phi3/modeling_phi3.py +4 -4
  683. transformers/models/phi3/modular_phi3.py +3 -3
  684. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
  685. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
  686. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
  687. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
  688. transformers/models/phimoe/configuration_phimoe.py +5 -7
  689. transformers/models/phimoe/modeling_phimoe.py +15 -39
  690. transformers/models/phimoe/modular_phimoe.py +12 -7
  691. transformers/models/pix2struct/configuration_pix2struct.py +12 -9
  692. transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
  693. transformers/models/pix2struct/modeling_pix2struct.py +14 -7
  694. transformers/models/pixio/configuration_pixio.py +2 -4
  695. transformers/models/pixio/modeling_pixio.py +9 -8
  696. transformers/models/pixio/modular_pixio.py +4 -2
  697. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
  698. transformers/models/pixtral/modeling_pixtral.py +9 -12
  699. transformers/models/plbart/configuration_plbart.py +8 -5
  700. transformers/models/plbart/modeling_plbart.py +9 -7
  701. transformers/models/plbart/modular_plbart.py +1 -1
  702. transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
  703. transformers/models/pop2piano/configuration_pop2piano.py +7 -6
  704. transformers/models/pop2piano/modeling_pop2piano.py +2 -1
  705. transformers/models/pp_doclayout_v3/__init__.py +30 -0
  706. transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
  707. transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
  708. transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
  709. transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
  710. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
  711. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
  712. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
  713. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
  714. transformers/models/prophetnet/configuration_prophetnet.py +11 -10
  715. transformers/models/prophetnet/modeling_prophetnet.py +12 -23
  716. transformers/models/pvt/image_processing_pvt.py +7 -7
  717. transformers/models/pvt/image_processing_pvt_fast.py +1 -1
  718. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  719. transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
  720. transformers/models/qwen2/configuration_qwen2.py +14 -4
  721. transformers/models/qwen2/modeling_qwen2.py +4 -4
  722. transformers/models/qwen2/modular_qwen2.py +3 -3
  723. transformers/models/qwen2/tokenization_qwen2.py +0 -4
  724. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
  725. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
  726. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
  727. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
  728. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
  729. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
  730. transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
  731. transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
  732. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  733. transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
  734. transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
  735. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
  736. transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
  737. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
  738. transformers/models/qwen3/configuration_qwen3.py +15 -5
  739. transformers/models/qwen3/modeling_qwen3.py +4 -4
  740. transformers/models/qwen3/modular_qwen3.py +3 -3
  741. transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
  742. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  743. transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
  744. transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
  745. transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
  746. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
  747. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
  748. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
  749. transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
  750. transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
  751. transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
  752. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
  753. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
  754. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
  755. transformers/models/rag/configuration_rag.py +6 -6
  756. transformers/models/rag/modeling_rag.py +3 -3
  757. transformers/models/rag/retrieval_rag.py +1 -1
  758. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
  759. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
  760. transformers/models/reformer/configuration_reformer.py +7 -7
  761. transformers/models/rembert/configuration_rembert.py +8 -1
  762. transformers/models/rembert/modeling_rembert.py +0 -22
  763. transformers/models/resnet/configuration_resnet.py +2 -4
  764. transformers/models/resnet/modeling_resnet.py +6 -5
  765. transformers/models/roberta/configuration_roberta.py +11 -2
  766. transformers/models/roberta/modeling_roberta.py +6 -6
  767. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
  768. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
  769. transformers/models/roc_bert/configuration_roc_bert.py +8 -1
  770. transformers/models/roc_bert/modeling_roc_bert.py +6 -41
  771. transformers/models/roformer/configuration_roformer.py +13 -2
  772. transformers/models/roformer/modeling_roformer.py +0 -14
  773. transformers/models/rt_detr/configuration_rt_detr.py +8 -49
  774. transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
  775. transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
  776. transformers/models/rt_detr/modeling_rt_detr.py +578 -737
  777. transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
  778. transformers/models/rt_detr/modular_rt_detr.py +1508 -6
  779. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
  780. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
  781. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
  782. transformers/models/rwkv/configuration_rwkv.py +2 -3
  783. transformers/models/rwkv/modeling_rwkv.py +0 -23
  784. transformers/models/sam/configuration_sam.py +2 -0
  785. transformers/models/sam/image_processing_sam_fast.py +4 -4
  786. transformers/models/sam/modeling_sam.py +13 -8
  787. transformers/models/sam/processing_sam.py +3 -3
  788. transformers/models/sam2/configuration_sam2.py +1 -1
  789. transformers/models/sam2/modeling_sam2.py +56 -52
  790. transformers/models/sam2/modular_sam2.py +47 -55
  791. transformers/models/sam2_video/modeling_sam2_video.py +50 -51
  792. transformers/models/sam2_video/modular_sam2_video.py +12 -10
  793. transformers/models/sam3/modeling_sam3.py +43 -47
  794. transformers/models/sam3/processing_sam3.py +8 -4
  795. transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
  796. transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
  797. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
  798. transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
  799. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
  800. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
  801. transformers/models/sam3_video/modeling_sam3_video.py +27 -14
  802. transformers/models/sam_hq/configuration_sam_hq.py +2 -0
  803. transformers/models/sam_hq/modeling_sam_hq.py +13 -9
  804. transformers/models/sam_hq/modular_sam_hq.py +6 -6
  805. transformers/models/sam_hq/processing_sam_hq.py +7 -6
  806. transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
  807. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
  808. transformers/models/seed_oss/configuration_seed_oss.py +7 -9
  809. transformers/models/seed_oss/modeling_seed_oss.py +4 -4
  810. transformers/models/seed_oss/modular_seed_oss.py +3 -3
  811. transformers/models/segformer/image_processing_segformer_fast.py +4 -4
  812. transformers/models/segformer/modeling_segformer.py +4 -2
  813. transformers/models/segformer/modular_segformer.py +3 -3
  814. transformers/models/seggpt/modeling_seggpt.py +20 -8
  815. transformers/models/sew/configuration_sew.py +4 -1
  816. transformers/models/sew/modeling_sew.py +9 -5
  817. transformers/models/sew/modular_sew.py +2 -1
  818. transformers/models/sew_d/configuration_sew_d.py +4 -1
  819. transformers/models/sew_d/modeling_sew_d.py +4 -1
  820. transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
  821. transformers/models/siglip/configuration_siglip.py +4 -1
  822. transformers/models/siglip/modeling_siglip.py +27 -71
  823. transformers/models/siglip2/__init__.py +1 -0
  824. transformers/models/siglip2/configuration_siglip2.py +4 -2
  825. transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
  826. transformers/models/siglip2/modeling_siglip2.py +37 -78
  827. transformers/models/siglip2/modular_siglip2.py +74 -25
  828. transformers/models/siglip2/tokenization_siglip2.py +95 -0
  829. transformers/models/smollm3/configuration_smollm3.py +6 -6
  830. transformers/models/smollm3/modeling_smollm3.py +4 -4
  831. transformers/models/smollm3/modular_smollm3.py +9 -9
  832. transformers/models/smolvlm/configuration_smolvlm.py +1 -3
  833. transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
  834. transformers/models/smolvlm/modeling_smolvlm.py +75 -46
  835. transformers/models/smolvlm/modular_smolvlm.py +36 -23
  836. transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
  837. transformers/models/solar_open/__init__.py +27 -0
  838. transformers/models/solar_open/configuration_solar_open.py +184 -0
  839. transformers/models/solar_open/modeling_solar_open.py +642 -0
  840. transformers/models/solar_open/modular_solar_open.py +224 -0
  841. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
  842. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
  843. transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
  844. transformers/models/speecht5/configuration_speecht5.py +7 -8
  845. transformers/models/splinter/configuration_splinter.py +6 -6
  846. transformers/models/splinter/modeling_splinter.py +8 -3
  847. transformers/models/squeezebert/configuration_squeezebert.py +14 -1
  848. transformers/models/stablelm/configuration_stablelm.py +8 -6
  849. transformers/models/stablelm/modeling_stablelm.py +5 -5
  850. transformers/models/starcoder2/configuration_starcoder2.py +11 -5
  851. transformers/models/starcoder2/modeling_starcoder2.py +5 -5
  852. transformers/models/starcoder2/modular_starcoder2.py +4 -4
  853. transformers/models/superglue/configuration_superglue.py +4 -0
  854. transformers/models/superglue/image_processing_superglue_fast.py +4 -3
  855. transformers/models/superglue/modeling_superglue.py +9 -4
  856. transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
  857. transformers/models/superpoint/modeling_superpoint.py +4 -2
  858. transformers/models/swin/configuration_swin.py +2 -4
  859. transformers/models/swin/modeling_swin.py +11 -8
  860. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
  861. transformers/models/swin2sr/modeling_swin2sr.py +4 -2
  862. transformers/models/swinv2/configuration_swinv2.py +2 -4
  863. transformers/models/swinv2/modeling_swinv2.py +10 -7
  864. transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
  865. transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
  866. transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
  867. transformers/models/t5/configuration_t5.py +9 -8
  868. transformers/models/t5/modeling_t5.py +5 -8
  869. transformers/models/t5gemma/configuration_t5gemma.py +10 -25
  870. transformers/models/t5gemma/modeling_t5gemma.py +9 -9
  871. transformers/models/t5gemma/modular_t5gemma.py +11 -24
  872. transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
  873. transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
  874. transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
  875. transformers/models/table_transformer/configuration_table_transformer.py +18 -49
  876. transformers/models/table_transformer/modeling_table_transformer.py +27 -53
  877. transformers/models/tapas/configuration_tapas.py +12 -1
  878. transformers/models/tapas/modeling_tapas.py +1 -1
  879. transformers/models/tapas/tokenization_tapas.py +1 -0
  880. transformers/models/textnet/configuration_textnet.py +4 -6
  881. transformers/models/textnet/image_processing_textnet_fast.py +3 -3
  882. transformers/models/textnet/modeling_textnet.py +15 -14
  883. transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
  884. transformers/models/timesfm/modeling_timesfm.py +5 -6
  885. transformers/models/timesfm/modular_timesfm.py +5 -6
  886. transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
  887. transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
  888. transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
  889. transformers/models/trocr/configuration_trocr.py +11 -7
  890. transformers/models/trocr/modeling_trocr.py +4 -2
  891. transformers/models/tvp/configuration_tvp.py +10 -35
  892. transformers/models/tvp/image_processing_tvp_fast.py +6 -5
  893. transformers/models/tvp/modeling_tvp.py +1 -1
  894. transformers/models/udop/configuration_udop.py +16 -7
  895. transformers/models/udop/modeling_udop.py +10 -6
  896. transformers/models/umt5/configuration_umt5.py +8 -6
  897. transformers/models/umt5/modeling_umt5.py +7 -3
  898. transformers/models/unispeech/configuration_unispeech.py +4 -1
  899. transformers/models/unispeech/modeling_unispeech.py +7 -4
  900. transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
  901. transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
  902. transformers/models/upernet/configuration_upernet.py +8 -35
  903. transformers/models/upernet/modeling_upernet.py +1 -1
  904. transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
  905. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  906. transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
  907. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
  908. transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
  909. transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
  910. transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
  911. transformers/models/video_llava/configuration_video_llava.py +4 -0
  912. transformers/models/video_llava/modeling_video_llava.py +87 -89
  913. transformers/models/videomae/modeling_videomae.py +4 -5
  914. transformers/models/vilt/configuration_vilt.py +4 -1
  915. transformers/models/vilt/image_processing_vilt_fast.py +6 -6
  916. transformers/models/vilt/modeling_vilt.py +27 -12
  917. transformers/models/vipllava/configuration_vipllava.py +4 -0
  918. transformers/models/vipllava/modeling_vipllava.py +57 -31
  919. transformers/models/vipllava/modular_vipllava.py +50 -24
  920. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
  921. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
  922. transformers/models/visual_bert/configuration_visual_bert.py +6 -1
  923. transformers/models/vit/configuration_vit.py +2 -2
  924. transformers/models/vit/modeling_vit.py +7 -5
  925. transformers/models/vit_mae/modeling_vit_mae.py +11 -7
  926. transformers/models/vit_msn/modeling_vit_msn.py +11 -7
  927. transformers/models/vitdet/configuration_vitdet.py +2 -4
  928. transformers/models/vitdet/modeling_vitdet.py +2 -3
  929. transformers/models/vitmatte/configuration_vitmatte.py +6 -35
  930. transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
  931. transformers/models/vitmatte/modeling_vitmatte.py +1 -1
  932. transformers/models/vitpose/configuration_vitpose.py +6 -43
  933. transformers/models/vitpose/modeling_vitpose.py +5 -3
  934. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
  935. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
  936. transformers/models/vits/configuration_vits.py +4 -0
  937. transformers/models/vits/modeling_vits.py +9 -7
  938. transformers/models/vivit/modeling_vivit.py +4 -4
  939. transformers/models/vjepa2/modeling_vjepa2.py +9 -9
  940. transformers/models/voxtral/configuration_voxtral.py +0 -1
  941. transformers/models/voxtral/modeling_voxtral.py +25 -24
  942. transformers/models/voxtral/modular_voxtral.py +26 -20
  943. transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
  944. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
  945. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
  946. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
  947. transformers/models/wavlm/configuration_wavlm.py +4 -1
  948. transformers/models/wavlm/modeling_wavlm.py +4 -1
  949. transformers/models/whisper/configuration_whisper.py +6 -4
  950. transformers/models/whisper/generation_whisper.py +0 -1
  951. transformers/models/whisper/modeling_whisper.py +3 -3
  952. transformers/models/x_clip/configuration_x_clip.py +4 -1
  953. transformers/models/x_clip/modeling_x_clip.py +26 -27
  954. transformers/models/xglm/configuration_xglm.py +9 -7
  955. transformers/models/xlm/configuration_xlm.py +10 -7
  956. transformers/models/xlm/modeling_xlm.py +1 -1
  957. transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
  958. transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
  959. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
  960. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
  961. transformers/models/xlnet/configuration_xlnet.py +3 -1
  962. transformers/models/xlstm/configuration_xlstm.py +5 -7
  963. transformers/models/xlstm/modeling_xlstm.py +0 -32
  964. transformers/models/xmod/configuration_xmod.py +11 -2
  965. transformers/models/xmod/modeling_xmod.py +13 -16
  966. transformers/models/yolos/image_processing_yolos_fast.py +25 -28
  967. transformers/models/yolos/modeling_yolos.py +7 -7
  968. transformers/models/yolos/modular_yolos.py +16 -16
  969. transformers/models/yoso/configuration_yoso.py +8 -1
  970. transformers/models/youtu/__init__.py +27 -0
  971. transformers/models/youtu/configuration_youtu.py +194 -0
  972. transformers/models/youtu/modeling_youtu.py +619 -0
  973. transformers/models/youtu/modular_youtu.py +254 -0
  974. transformers/models/zamba/configuration_zamba.py +5 -7
  975. transformers/models/zamba/modeling_zamba.py +25 -56
  976. transformers/models/zamba2/configuration_zamba2.py +8 -13
  977. transformers/models/zamba2/modeling_zamba2.py +53 -78
  978. transformers/models/zamba2/modular_zamba2.py +36 -29
  979. transformers/models/zoedepth/configuration_zoedepth.py +17 -40
  980. transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
  981. transformers/models/zoedepth/modeling_zoedepth.py +5 -3
  982. transformers/pipelines/__init__.py +1 -61
  983. transformers/pipelines/any_to_any.py +1 -1
  984. transformers/pipelines/automatic_speech_recognition.py +0 -2
  985. transformers/pipelines/base.py +1 -1
  986. transformers/pipelines/image_text_to_text.py +1 -1
  987. transformers/pipelines/text_to_audio.py +5 -1
  988. transformers/processing_utils.py +35 -44
  989. transformers/pytorch_utils.py +2 -26
  990. transformers/quantizers/quantizer_compressed_tensors.py +7 -5
  991. transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
  992. transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
  993. transformers/quantizers/quantizer_mxfp4.py +1 -1
  994. transformers/quantizers/quantizer_torchao.py +0 -16
  995. transformers/safetensors_conversion.py +11 -4
  996. transformers/testing_utils.py +3 -28
  997. transformers/tokenization_mistral_common.py +9 -0
  998. transformers/tokenization_python.py +6 -4
  999. transformers/tokenization_utils_base.py +119 -219
  1000. transformers/tokenization_utils_tokenizers.py +31 -2
  1001. transformers/trainer.py +25 -33
  1002. transformers/trainer_seq2seq.py +1 -1
  1003. transformers/training_args.py +411 -417
  1004. transformers/utils/__init__.py +1 -4
  1005. transformers/utils/auto_docstring.py +15 -18
  1006. transformers/utils/backbone_utils.py +13 -373
  1007. transformers/utils/doc.py +4 -36
  1008. transformers/utils/generic.py +69 -33
  1009. transformers/utils/import_utils.py +72 -75
  1010. transformers/utils/loading_report.py +133 -105
  1011. transformers/utils/quantization_config.py +0 -21
  1012. transformers/video_processing_utils.py +5 -5
  1013. transformers/video_utils.py +3 -1
  1014. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
  1015. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
  1016. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
  1017. transformers/pipelines/deprecated/text2text_generation.py +0 -408
  1018. transformers/pipelines/image_to_text.py +0 -189
  1019. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
  1020. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
  1021. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -31,6 +31,8 @@ from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPooling,
     BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithPast,
+    Seq2SeqLMOutput,
 )
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
@@ -44,6 +46,20 @@ from .configuration_instructblip import InstructBlipConfig, InstructBlipQFormerC
 logger = logging.get_logger(__name__)
 
 
+@dataclass
+@auto_docstring
+class BaseModelOutputWithVisionQformerOutputs(BaseModelOutputWithPooling):
+    r"""
+    vision_outputs (`BaseModelOutputWithPooling`):
+        Outputs of the vision encoder.
+    qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
+        Outputs of the Q-Former (Querying Transformer).
+    """
+
+    vision_outputs: BaseModelOutputWithPooling | None = None
+    qformer_outputs: BaseModelOutputWithPoolingAndCrossAttentions | None = None
+
+
 @dataclass
 @auto_docstring(
     custom_intro="""
@@ -67,9 +83,9 @@ class InstructBlipForConditionalGenerationModelOutput(ModelOutput):
 
     loss: tuple[torch.FloatTensor] | None = None
     logits: tuple[torch.FloatTensor] | None = None
-    vision_outputs: torch.FloatTensor | None = None
-    qformer_outputs: tuple[torch.FloatTensor] | None = None
-    language_model_outputs: tuple[torch.FloatTensor] | None = None
+    vision_outputs: BaseModelOutputWithPooling | None = None
+    qformer_outputs: BaseModelOutputWithPoolingAndCrossAttentions | None = None
+    language_model_outputs: CausalLMOutputWithPast | Seq2SeqLMOutput | None = None
 
     def to_tuple(self) -> tuple[Any]:
         return tuple(
@@ -233,10 +249,9 @@ class InstructBlipAttention(nn.Module):
         )
         query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]
 
-        attention_interface: Callable = eager_attention_forward
-
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
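
The change above replaces the manual registry lookup with `ALL_ATTENTION_FUNCTIONS.get_interface(name, default)`. A minimal sketch of the equivalent logic it subsumes, inferred from the removed lines (the helper below is a stand-in, not the package's implementation):

    # Stand-in for the registry lookup: "eager" is never registered, every
    # other backend name is resolved through the registry.
    def get_interface(registry: dict, attn_implementation: str, default):
        if attn_implementation == "eager":
            return default
        return registry[attn_implementation]

    def eager_attention_forward(*args, **kwargs): ...  # placeholder kernel

    assert get_interface({}, "eager", eager_attention_forward) is eager_attention_forward
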
@@ -371,7 +386,6 @@ class InstructBlipEncoder(nn.Module):
         return BaseModelOutput(last_hidden_state=hidden_states)
 
 
-# Copied from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->InstructBlip, BLIP->INSTRUCTBLIP
 class InstructBlipVisionModel(InstructBlipPreTrainedModel):
     main_input_name = "pixel_values"
     input_modalities = ("image",)
@@ -940,12 +954,6 @@ class InstructBlipModel(InstructBlipPreTrainedModel):
         self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
         self.language_model = AutoModel.from_config(config.text_config)
 
-        if self.language_model._no_split_modules is not None:
-            self._no_split_modules.extend(self.language_model._no_split_modules)
-
-        if self.language_model._keep_in_fp32_modules is not None:
-            self._keep_in_fp32_modules.extend(self.language_model._keep_in_fp32_modules)
-
         # Initialize weights and apply final processing
         self.post_init()
 
@@ -1120,12 +1128,6 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati
         else:
             language_model = AutoModelForSeq2SeqLM.from_config(config.text_config)
 
-        if language_model._no_split_modules is not None:
-            self._no_split_modules.extend(language_model._no_split_modules)
-
-        if language_model._keep_in_fp32_modules is not None:
-            self._keep_in_fp32_modules.extend(language_model._keep_in_fp32_modules)
-
         self.language_model = language_model
 
         # Initialize weights and apply final processing
@@ -1173,28 +1175,44 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati
         if hasattr(self.language_model, "_hf_hook"):
             self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility
 
+    @can_return_tuple
+    @auto_docstring
     def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         qformer_input_ids: torch.LongTensor,
         qformer_attention_mask: torch.LongTensor | None = None,
         interpolate_pos_encoding: bool | None = False,
-        return_dict: bool | None = False,
-    ):
-        """
-        Encodes images into continuous embeddings that can be forwarded to the language model.
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithVisionQformerOutputs:
+        r"""
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The tensors corresponding to the input images.
+        qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
+            to serve as text prompt, which the Q-Former model will encode.
 
-        Args:
-            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
-                The tensors corresponding to the input images.
+            Indices can be obtained using [`InstructBlipProcessor`]. See [`InstructBlipProcessor.__call__`] for
+            details.
+
+            [What are input IDs?](../glossary#input-ids)
+        qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
         """
         # step 1: forward the images through the vision encoder,
         # to get image embeddings of shape (batch_size, seq_len, hidden_size)
-        vision_outputs = self.vision_model(
+        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
+            **kwargs,
         )
+        vision_outputs = BaseModelOutputWithVisionQformerOutputs(**vision_outputs, vision_outputs=vision_outputs)
         image_embeds = vision_outputs[0]
 
         # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
@@ -1206,21 +1224,23 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati
         if qformer_attention_mask is None:
             qformer_attention_mask = torch.ones_like(qformer_input_ids)
         qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
-        query_outputs = self.qformer(
+        qformer_outputs = self.qformer(
             input_ids=qformer_input_ids,
             attention_mask=qformer_attention_mask,
             query_embeds=query_tokens,
             encoder_hidden_states=image_embeds,
             encoder_attention_mask=image_attention_mask,
             return_dict=True,
+            **kwargs,
         )
-        query_output = query_outputs[0][:, : query_tokens.size(1), :]
+        vision_outputs.qformer_outputs = qformer_outputs
+        query_output = qformer_outputs[0][:, : query_tokens.size(1), :]
 
         # step 3: use the language model, conditioned on the query outputs and the prompt
-        language_model_inputs = self.language_projection(query_output)
-        if return_dict:
-            return language_model_inputs, vision_outputs, query_outputs
-        return language_model_inputs
+        image_features = self.language_projection(query_output)
+        vision_outputs.pooler_output = image_features
+
+        return vision_outputs
 
     def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
         """
@@ -1285,7 +1305,8 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati
        >>> from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
        >>> import torch
        >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
 
        >>> model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")
        >>> processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
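
The docstring examples migrate from `requests` to `httpx`; the streaming call appears in the next hunk. For a one-shot download, a buffered form would also work (our suggestion, not code from the package):

    import httpx
    from io import BytesIO
    from PIL import Image

    url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
    # httpx.get() buffers the whole body; .content holds the raw bytes
    image = Image.open(BytesIO(httpx.get(url).content)).convert("RGB")
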
@@ -1294,7 +1315,8 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati
        >>> model.to(device)  # doctest: +IGNORE_RESULT
 
        >>> url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read())).convert("RGB")
        >>> prompt = "What is unusual about this image?"
        >>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
 
@@ -1314,13 +1336,16 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati
        The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV, which is parked in the middle of a busy city street. This is an unconventional approach to ironing clothes, as it requires the man to balance himself and his ironing equipment on top of the vehicle while navigating through traffic. Additionally, the presence of taxis and other vehicles in the scene further emphasizes the unusual nature of this situation.
        ```"""
 
-        language_model_inputs, vision_outputs, query_outputs = self.get_image_features(
+        image_features: BaseModelOutputWithVisionQformerOutputs = self.get_image_features(
             pixel_values,
             qformer_input_ids=qformer_input_ids,
             qformer_attention_mask=qformer_attention_mask,
             interpolate_pos_encoding=interpolate_pos_encoding,
             return_dict=True,
         )
+        language_model_inputs = image_features.pooler_output
+        qformer_outputs = image_features.qformer_outputs
+        vision_outputs = image_features.vision_outputs
 
         if inputs_embeds is None:
             inputs_embeds = self.get_input_embeddings()(input_ids)
@@ -1362,7 +1387,7 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati
             loss=loss,
             logits=logits,
             vision_outputs=vision_outputs,
-            qformer_outputs=query_outputs,
+            qformer_outputs=qformer_outputs,
             language_model_outputs=outputs,
         )
 
@@ -1405,13 +1430,14 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati
         self._preprocess_accelerate()
 
         batch_size = pixel_values.shape[0]
-        language_model_inputs, vision_outputs, query_outputs = self.get_image_features(
+        image_features: BaseModelOutputWithVisionQformerOutputs = self.get_image_features(
             pixel_values,
             qformer_input_ids=qformer_input_ids,
             qformer_attention_mask=qformer_attention_mask,
             interpolate_pos_encoding=interpolate_pos_encoding,
             return_dict=True,
         )
+        language_model_inputs = image_features.pooler_output
 
         if inputs_embeds is None:
             if input_ids is None:
@@ -191,7 +191,8 @@ class InstructBlipVideoQFormerConfig(PreTrainedConfig):
         encoder_hidden_size=1408,
         **kwargs,
     ):
-        super().__init__(pad_token_id=pad_token_id, **kwargs)
+        super().__init__(**kwargs)
+        self.pad_token_id = pad_token_id
 
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
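
The config hunk above stops forwarding `pad_token_id` to the base constructor and assigns it directly on the instance; the resulting attribute is the same either way. Illustrative check (assumes the class is importable from the top-level `transformers` namespace):

    from transformers import InstructBlipVideoQFormerConfig

    cfg = InstructBlipVideoQFormerConfig(pad_token_id=0)
    assert cfg.pad_token_id == 0  # still set, now assigned directly in __init__
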
@@ -37,6 +37,8 @@ from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPooling,
     BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithPast,
+    Seq2SeqLMOutput,
 )
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
@@ -289,10 +291,9 @@ class InstructBlipVideoAttention(nn.Module):
         )
         query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]
 
-        attention_interface: Callable = eager_attention_forward
-
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
 
         attn_output, attn_weights = attention_interface(
             self,
@@ -906,9 +907,9 @@ class InstructBlipVideoForConditionalGenerationModelOutput(ModelOutput):
 
     loss: tuple[torch.FloatTensor] | None = None
     logits: tuple[torch.FloatTensor] | None = None
-    vision_outputs: torch.FloatTensor | None = None
-    qformer_outputs: tuple[torch.FloatTensor] | None = None
-    language_model_outputs: tuple[torch.FloatTensor] | None = None
+    vision_outputs: BaseModelOutputWithPooling | None = None
+    qformer_outputs: BaseModelOutputWithPoolingAndCrossAttentions | None = None
+    language_model_outputs: CausalLMOutputWithPast | Seq2SeqLMOutput | None = None
 
     def to_tuple(self) -> tuple[Any]:
         return tuple(
@@ -938,12 +939,6 @@ class InstructBlipVideoModel(InstructBlipVideoPreTrainedModel):
         self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
         self.language_model = AutoModel.from_config(config.text_config)
 
-        if self.language_model._no_split_modules is not None:
-            self._no_split_modules.extend(self.language_model._no_split_modules)
-
-        if self.language_model._keep_in_fp32_modules is not None:
-            self._keep_in_fp32_modules.extend(self.language_model._keep_in_fp32_modules)
-
         # Initialize weights and apply final processing
         self.post_init()
 
@@ -1120,6 +1115,20 @@ class InstructBlipVideoModel(InstructBlipVideoPreTrainedModel):
         )
 
 
+@dataclass
+@auto_docstring
+class BaseModelOutputWithVisionQformerOutputs(BaseModelOutputWithPooling):
+    r"""
+    vision_outputs (`BaseModelOutputWithPooling`):
+        Outputs of the vision encoder.
+    qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
+        Outputs of the Q-Former (Querying Transformer).
+    """
+
+    vision_outputs: BaseModelOutputWithPooling | None = None
+    qformer_outputs: BaseModelOutputWithPoolingAndCrossAttentions | None = None
+
+
 @auto_docstring(
     custom_intro="""
     InstructBlipVideo Model for generating text given an image and an optional text prompt. The model consists of a vision
@@ -1151,12 +1160,6 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel
         else:
             language_model = AutoModelForSeq2SeqLM.from_config(config.text_config)
 
-        if language_model._no_split_modules is not None:
-            self._no_split_modules.extend(language_model._no_split_modules)
-
-        if language_model._keep_in_fp32_modules is not None:
-            self._keep_in_fp32_modules.extend(language_model._keep_in_fp32_modules)
-
         self.language_model = language_model
 
         # Initialize weights and apply final processing
@@ -1203,23 +1206,6 @@
         if hasattr(self.language_model, "_hf_hook"):
             self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility
 
-    def get_image_features(
-        self,
-        pixel_values: torch.FloatTensor,
-        qformer_input_ids: torch.LongTensor,
-        qformer_attention_mask: torch.LongTensor | None = None,
-        interpolate_pos_encoding: bool | None = False,
-        return_dict: bool | None = False,
-    ):
-        """
-        Encodes images into continuous embeddings that can be forwarded to the language model.
-
-        Args:
-            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
-                The tensors corresponding to the input images.
-        """
-        pass
-
     def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
         """
         Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
@@ -1320,15 +1306,17 @@
        ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
+        video_features: BaseModelOutputWithVisionQformerOutputs = self.get_video_features(
             pixel_values,
             qformer_input_ids=qformer_input_ids,
             qformer_attention_mask=qformer_attention_mask,
             interpolate_pos_encoding=interpolate_pos_encoding,
             return_dict=True,
+            **kwargs,
         )
-        vision_outputs = vision_outputs.to_tuple() if not return_dict else vision_outputs
-        query_outputs = query_outputs.to_tuple() if not return_dict else query_outputs
+        language_model_inputs = video_features.pooler_output
+        qformer_outputs = video_features.qformer_outputs
+        vision_outputs = video_features.vision_outputs
 
         if inputs_embeds is None:
             inputs_embeds = self.get_input_embeddings()(input_ids)
@@ -1377,7 +1365,7 @@
             loss=loss,
             logits=logits,
             vision_outputs=vision_outputs,
-            qformer_outputs=query_outputs,
+            qformer_outputs=qformer_outputs,
             language_model_outputs=outputs,
         )
 
@@ -1420,13 +1408,14 @@
         self._preprocess_accelerate()
 
         batch_size = pixel_values.shape[0]
-        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
+        video_features: BaseModelOutputWithVisionQformerOutputs = self.get_video_features(
             pixel_values,
             qformer_input_ids=qformer_input_ids,
             qformer_attention_mask=qformer_attention_mask,
             interpolate_pos_encoding=interpolate_pos_encoding,
             return_dict=True,
         )
+        language_model_inputs = video_features.pooler_output
 
         if inputs_embeds is None:
             if input_ids is None:
@@ -1451,30 +1440,42 @@
 
         return outputs
 
+    @can_return_tuple
+    @auto_docstring
     def get_video_features(
         self,
         pixel_values: torch.FloatTensor,
         qformer_input_ids: torch.LongTensor,
         qformer_attention_mask: torch.LongTensor | None = None,
         interpolate_pos_encoding: bool | None = False,
-        return_dict: bool | None = False,
-    ):
-        """
-        Encodes images into continuous embeddings that can be forwarded to the language model.
-
-        Args:
-            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
-                The tensors corresponding to the input images.
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithVisionQformerOutputs:
+        r"""
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The tensors corresponding to the input images.
+        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
+            The sequence used as a prompt to be fed to the Q-Former module.
+        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
+            Mask to avoid performing attention on padding token indices.
         """
         # step 1: forward the images through the vision encoder,
         # we process in a batched way, later unbatch it back (video has frames=4 always)
         batch_size, frames, channel, height, width = pixel_values.shape
         pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)
 
-        vision_outputs = self.vision_model(
+        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
             pixel_values=pixel_values,
             interpolate_pos_encoding=interpolate_pos_encoding,
             return_dict=True,
+            **kwargs,
+        )
+        vision_outputs = BaseModelOutputWithVisionQformerOutputs(
+            last_hidden_state=vision_outputs.last_hidden_state,
+            pooler_output=vision_outputs.pooler_output,
+            hidden_states=vision_outputs.hidden_states,
+            attentions=vision_outputs.attentions,
+            vision_outputs=vision_outputs,
+            qformer_outputs=None,
         )
         image_embeds = vision_outputs[0]
 
@@ -1491,24 +1492,26 @@
         qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
         qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
         qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
-        query_outputs = self.qformer(
+        qformer_outputs = self.qformer(
             input_ids=qformer_input_ids,
             attention_mask=qformer_attention_mask,
             query_embeds=query_tokens,
             encoder_hidden_states=image_embeds,
             encoder_attention_mask=image_attention_mask,
             return_dict=True,
+            **kwargs,
         )
-        query_output = query_outputs[0][:, : query_tokens.size(1), :]
+        vision_outputs.qformer_outputs = qformer_outputs
+        query_output = qformer_outputs[0][:, : query_tokens.size(1), :]
 
         # step 3: use the language model, conditioned on the query outputs and the prompt
-        language_model_inputs = self.language_projection(query_output)
+        video_features = self.language_projection(query_output)
 
         # unbatch inputs back, each video-frame gets `num_query_tokens` seq length
-        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)
-        if return_dict:
-            return language_model_inputs, vision_outputs, query_outputs
-        return language_model_inputs
+        video_features = video_features.reshape(batch_size, self.config.num_query_tokens * frames, -1)
+        vision_outputs.pooler_output = video_features
+
+        return vision_outputs
 
 
 __all__ = [
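
The reshape in the hunks above folds the per-frame Q-Former outputs back into one sequence per video, so each video contributes `num_query_tokens * frames` positions to the language model. A shape sketch with illustrative numbers:

    import torch

    batch_size, frames, num_query_tokens, hidden = 2, 4, 32, 4096
    # projected Q-Former output, still batched over batch_size * frames
    per_frame = torch.randn(batch_size * frames, num_query_tokens, hidden)
    # one sequence per video: 32 query tokens x 4 frames = 128 positions
    video_features = per_frame.reshape(batch_size, num_query_tokens * frames, hidden)
    assert video_features.shape == (2, 128, 4096)
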
@@ -20,6 +20,7 @@ from transformers.models.instructblip.configuration_instructblip import (
     InstructBlipVisionConfig,
 )
 from transformers.models.instructblip.modeling_instructblip import (
+    BaseModelOutputWithVisionQformerOutputs,
     InstructBlipForConditionalGeneration,
     InstructBlipForConditionalGenerationModelOutput,
     InstructBlipModel,
@@ -31,9 +32,10 @@ from transformers.models.instructblip.modeling_instructblip import (
 
 from ...configuration_utils import PreTrainedConfig
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_outputs import BaseModelOutputWithPooling
 from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 from ...processing_utils import Unpack
-from ...utils import logging
+from ...utils import auto_docstring, can_return_tuple, logging
 from ..auto import CONFIG_MAPPING, AutoConfig
 
 
@@ -283,30 +285,42 @@ class InstructBlipVideoModel(InstructBlipModel):
 
 
 class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGeneration):
+    @can_return_tuple
+    @auto_docstring
     def get_video_features(
         self,
         pixel_values: torch.FloatTensor,
         qformer_input_ids: torch.LongTensor,
         qformer_attention_mask: torch.LongTensor | None = None,
         interpolate_pos_encoding: bool | None = False,
-        return_dict: bool | None = False,
-    ):
-        """
-        Encodes images into continuous embeddings that can be forwarded to the language model.
-
-        Args:
-            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
-                The tensors corresponding to the input images.
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | BaseModelOutputWithVisionQformerOutputs:
+        r"""
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The tensors corresponding to the input images.
+        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
+            The sequence used as a prompt to be fed to the Q-Former module.
+        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
+            Mask to avoid performing attention on padding token indices.
         """
         # step 1: forward the images through the vision encoder,
         # we process in a batched way, later unbatch it back (video has frames=4 always)
         batch_size, frames, channel, height, width = pixel_values.shape
         pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)
 
-        vision_outputs = self.vision_model(
+        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
             pixel_values=pixel_values,
             interpolate_pos_encoding=interpolate_pos_encoding,
             return_dict=True,
+            **kwargs,
+        )
+        vision_outputs = BaseModelOutputWithVisionQformerOutputs(
+            last_hidden_state=vision_outputs.last_hidden_state,
+            pooler_output=vision_outputs.pooler_output,
+            hidden_states=vision_outputs.hidden_states,
+            attentions=vision_outputs.attentions,
+            vision_outputs=vision_outputs,
+            qformer_outputs=None,
         )
         image_embeds = vision_outputs[0]
 
@@ -323,35 +337,29 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGenera
         qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
         qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
         qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
-        query_outputs = self.qformer(
+        qformer_outputs = self.qformer(
             input_ids=qformer_input_ids,
             attention_mask=qformer_attention_mask,
             query_embeds=query_tokens,
             encoder_hidden_states=image_embeds,
             encoder_attention_mask=image_attention_mask,
             return_dict=True,
+            **kwargs,
         )
-        query_output = query_outputs[0][:, : query_tokens.size(1), :]
+        vision_outputs.qformer_outputs = qformer_outputs
+        query_output = qformer_outputs[0][:, : query_tokens.size(1), :]
 
         # step 3: use the language model, conditioned on the query outputs and the prompt
-        language_model_inputs = self.language_projection(query_output)
+        video_features = self.language_projection(query_output)
 
         # unbatch inputs back, each video-frame gets `num_query_tokens` seq length
-        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)
-        if return_dict:
-            return language_model_inputs, vision_outputs, query_outputs
-        return language_model_inputs
+        video_features = video_features.reshape(batch_size, self.config.num_query_tokens * frames, -1)
+        vision_outputs.pooler_output = video_features
 
-    # Model supports only videos
-    def get_image_features(
-        self,
-        pixel_values: torch.FloatTensor,
-        qformer_input_ids: torch.LongTensor,
-        qformer_attention_mask: torch.LongTensor | None = None,
-        interpolate_pos_encoding: bool | None = False,
-        return_dict: bool | None = False,
-    ):
-        pass
+        return vision_outputs
+
+    def get_image_features(**super_kwargs):
+        raise AttributeError("No need to inherit as this architecture only supports videos.")
 
     def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
         """
@@ -451,15 +459,17 @@ class InstructBlipVideoForConditionalGenera
        ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
+        video_features: BaseModelOutputWithVisionQformerOutputs = self.get_video_features(
             pixel_values,
             qformer_input_ids=qformer_input_ids,
             qformer_attention_mask=qformer_attention_mask,
             interpolate_pos_encoding=interpolate_pos_encoding,
             return_dict=True,
+            **kwargs,
         )
-        vision_outputs = vision_outputs.to_tuple() if not return_dict else vision_outputs
-        query_outputs = query_outputs.to_tuple() if not return_dict else query_outputs
+        language_model_inputs = video_features.pooler_output
+        qformer_outputs = video_features.qformer_outputs
+        vision_outputs = video_features.vision_outputs
 
         if inputs_embeds is None:
             inputs_embeds = self.get_input_embeddings()(input_ids)
@@ -508,7 +518,7 @@ class InstructBlipVideoForConditionalGenera
             loss=loss,
             logits=logits,
             vision_outputs=vision_outputs,
-            qformer_outputs=query_outputs,
+            qformer_outputs=qformer_outputs,
             language_model_outputs=outputs,
         )
 
@@ -551,13 +561,14 @@ class InstructBlipVideoForConditionalGenera
         self._preprocess_accelerate()
 
         batch_size = pixel_values.shape[0]
-        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
+        video_features: BaseModelOutputWithVisionQformerOutputs = self.get_video_features(
             pixel_values,
             qformer_input_ids=qformer_input_ids,
             qformer_attention_mask=qformer_attention_mask,
             interpolate_pos_encoding=interpolate_pos_encoding,
             return_dict=True,
         )
+        language_model_inputs = video_features.pooler_output
 
         if inputs_embeds is None:
             if input_ids is None:
@@ -19,7 +19,7 @@ Video processor class for InstructBLIPVideo
 from typing import Optional
 
 import torch
-from torchvision.transforms.v2 import functional as F
+import torchvision.transforms.v2.functional as tvF
 
 from ...image_processing_utils import BatchFeature
 from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict
@@ -47,7 +47,7 @@ class InstructBlipVideoVideoProcessor(BaseVideoProcessor):
         do_convert_rgb: bool,
         do_resize: bool,
         size: SizeDict,
-        interpolation: Optional["F.InterpolationMode"],
+        interpolation: Optional["tvF.InterpolationMode"],
         do_center_crop: bool,
         crop_size: SizeDict,
         do_rescale: bool,