transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in their public registry.
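For context, here is a minimal sketch of how a file-level diffstat like the one below can be reproduced locally. This is not the registry's own tooling; it assumes both wheels have already been downloaded (e.g. with `pip download transformers==5.1.0 --no-deps`), and the local filenames are assumptions.

```python
# Illustrative sketch: compute a per-file +added/-removed summary
# between two wheels using only the standard library.
import zipfile
import difflib

OLD_WHL = "transformers-5.0.0rc3-py3-none-any.whl"  # assumed local path
NEW_WHL = "transformers-5.1.0-py3-none-any.whl"     # assumed local path

def read_members(path):
    """Map each .py file inside the wheel to its decoded text lines."""
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode("utf-8", errors="replace").splitlines()
            for name in zf.namelist()
            if name.endswith(".py")
        }

old, new = read_members(OLD_WHL), read_members(NEW_WHL)
for name in sorted(old.keys() | new.keys()):
    diff = difflib.unified_diff(old.get(name, []), new.get(name, []), lineterm="")
    added = removed = 0
    for line in diff:
        # Skip the "---"/"+++" file headers; count only changed lines.
        if line.startswith("+") and not line.startswith("+++"):
            added += 1
        elif line.startswith("-") and not line.startswith("---"):
            removed += 1
    if added or removed:
        print(f"{name} +{added} -{removed}")
```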
Files changed (1021)
  1. transformers/__init__.py +4 -11
  2. transformers/activations.py +2 -2
  3. transformers/backbone_utils.py +326 -0
  4. transformers/cache_utils.py +11 -2
  5. transformers/cli/serve.py +11 -8
  6. transformers/configuration_utils.py +1 -69
  7. transformers/conversion_mapping.py +146 -26
  8. transformers/convert_slow_tokenizer.py +6 -4
  9. transformers/core_model_loading.py +207 -118
  10. transformers/dependency_versions_check.py +0 -1
  11. transformers/dependency_versions_table.py +7 -8
  12. transformers/file_utils.py +0 -2
  13. transformers/generation/candidate_generator.py +1 -2
  14. transformers/generation/continuous_batching/cache.py +40 -38
  15. transformers/generation/continuous_batching/cache_manager.py +3 -16
  16. transformers/generation/continuous_batching/continuous_api.py +94 -406
  17. transformers/generation/continuous_batching/input_ouputs.py +464 -0
  18. transformers/generation/continuous_batching/requests.py +54 -17
  19. transformers/generation/continuous_batching/scheduler.py +77 -95
  20. transformers/generation/logits_process.py +10 -5
  21. transformers/generation/stopping_criteria.py +1 -2
  22. transformers/generation/utils.py +75 -95
  23. transformers/image_processing_utils.py +0 -3
  24. transformers/image_processing_utils_fast.py +17 -18
  25. transformers/image_transforms.py +44 -13
  26. transformers/image_utils.py +0 -5
  27. transformers/initialization.py +57 -0
  28. transformers/integrations/__init__.py +10 -24
  29. transformers/integrations/accelerate.py +47 -11
  30. transformers/integrations/deepspeed.py +145 -3
  31. transformers/integrations/executorch.py +2 -6
  32. transformers/integrations/finegrained_fp8.py +142 -7
  33. transformers/integrations/flash_attention.py +2 -7
  34. transformers/integrations/hub_kernels.py +18 -7
  35. transformers/integrations/moe.py +226 -106
  36. transformers/integrations/mxfp4.py +47 -34
  37. transformers/integrations/peft.py +488 -176
  38. transformers/integrations/tensor_parallel.py +641 -581
  39. transformers/masking_utils.py +153 -9
  40. transformers/modeling_flash_attention_utils.py +1 -2
  41. transformers/modeling_utils.py +359 -358
  42. transformers/models/__init__.py +6 -0
  43. transformers/models/afmoe/configuration_afmoe.py +14 -4
  44. transformers/models/afmoe/modeling_afmoe.py +8 -8
  45. transformers/models/afmoe/modular_afmoe.py +7 -7
  46. transformers/models/aimv2/configuration_aimv2.py +2 -7
  47. transformers/models/aimv2/modeling_aimv2.py +26 -24
  48. transformers/models/aimv2/modular_aimv2.py +8 -12
  49. transformers/models/albert/configuration_albert.py +8 -1
  50. transformers/models/albert/modeling_albert.py +3 -3
  51. transformers/models/align/configuration_align.py +8 -5
  52. transformers/models/align/modeling_align.py +22 -24
  53. transformers/models/altclip/configuration_altclip.py +4 -6
  54. transformers/models/altclip/modeling_altclip.py +30 -26
  55. transformers/models/apertus/configuration_apertus.py +5 -7
  56. transformers/models/apertus/modeling_apertus.py +4 -4
  57. transformers/models/apertus/modular_apertus.py +8 -10
  58. transformers/models/arcee/configuration_arcee.py +5 -7
  59. transformers/models/arcee/modeling_arcee.py +4 -4
  60. transformers/models/aria/configuration_aria.py +11 -21
  61. transformers/models/aria/modeling_aria.py +39 -36
  62. transformers/models/aria/modular_aria.py +33 -39
  63. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
  64. transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
  65. transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
  66. transformers/models/auto/auto_factory.py +8 -6
  67. transformers/models/auto/configuration_auto.py +22 -0
  68. transformers/models/auto/image_processing_auto.py +17 -13
  69. transformers/models/auto/modeling_auto.py +15 -0
  70. transformers/models/auto/processing_auto.py +9 -18
  71. transformers/models/auto/tokenization_auto.py +17 -15
  72. transformers/models/autoformer/modeling_autoformer.py +2 -1
  73. transformers/models/aya_vision/configuration_aya_vision.py +4 -0
  74. transformers/models/aya_vision/modeling_aya_vision.py +29 -62
  75. transformers/models/aya_vision/modular_aya_vision.py +20 -45
  76. transformers/models/bamba/configuration_bamba.py +17 -7
  77. transformers/models/bamba/modeling_bamba.py +23 -55
  78. transformers/models/bamba/modular_bamba.py +19 -54
  79. transformers/models/bark/configuration_bark.py +2 -1
  80. transformers/models/bark/modeling_bark.py +24 -10
  81. transformers/models/bart/configuration_bart.py +9 -4
  82. transformers/models/bart/modeling_bart.py +9 -12
  83. transformers/models/beit/configuration_beit.py +2 -4
  84. transformers/models/beit/image_processing_beit_fast.py +3 -3
  85. transformers/models/beit/modeling_beit.py +14 -9
  86. transformers/models/bert/configuration_bert.py +12 -1
  87. transformers/models/bert/modeling_bert.py +6 -30
  88. transformers/models/bert_generation/configuration_bert_generation.py +17 -1
  89. transformers/models/bert_generation/modeling_bert_generation.py +6 -6
  90. transformers/models/big_bird/configuration_big_bird.py +12 -8
  91. transformers/models/big_bird/modeling_big_bird.py +0 -15
  92. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
  93. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
  94. transformers/models/biogpt/configuration_biogpt.py +8 -1
  95. transformers/models/biogpt/modeling_biogpt.py +4 -8
  96. transformers/models/biogpt/modular_biogpt.py +1 -5
  97. transformers/models/bit/configuration_bit.py +2 -4
  98. transformers/models/bit/modeling_bit.py +6 -5
  99. transformers/models/bitnet/configuration_bitnet.py +5 -7
  100. transformers/models/bitnet/modeling_bitnet.py +3 -4
  101. transformers/models/bitnet/modular_bitnet.py +3 -4
  102. transformers/models/blenderbot/configuration_blenderbot.py +8 -4
  103. transformers/models/blenderbot/modeling_blenderbot.py +4 -4
  104. transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
  105. transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
  106. transformers/models/blip/configuration_blip.py +9 -9
  107. transformers/models/blip/modeling_blip.py +55 -37
  108. transformers/models/blip_2/configuration_blip_2.py +2 -1
  109. transformers/models/blip_2/modeling_blip_2.py +81 -56
  110. transformers/models/bloom/configuration_bloom.py +5 -1
  111. transformers/models/bloom/modeling_bloom.py +2 -1
  112. transformers/models/blt/configuration_blt.py +23 -12
  113. transformers/models/blt/modeling_blt.py +20 -14
  114. transformers/models/blt/modular_blt.py +70 -10
  115. transformers/models/bridgetower/configuration_bridgetower.py +7 -1
  116. transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
  117. transformers/models/bridgetower/modeling_bridgetower.py +29 -15
  118. transformers/models/bros/configuration_bros.py +24 -17
  119. transformers/models/camembert/configuration_camembert.py +8 -1
  120. transformers/models/camembert/modeling_camembert.py +6 -6
  121. transformers/models/canine/configuration_canine.py +4 -1
  122. transformers/models/chameleon/configuration_chameleon.py +5 -7
  123. transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
  124. transformers/models/chameleon/modeling_chameleon.py +82 -36
  125. transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
  126. transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
  127. transformers/models/clap/configuration_clap.py +4 -8
  128. transformers/models/clap/modeling_clap.py +21 -22
  129. transformers/models/clip/configuration_clip.py +4 -1
  130. transformers/models/clip/image_processing_clip_fast.py +9 -0
  131. transformers/models/clip/modeling_clip.py +25 -22
  132. transformers/models/clipseg/configuration_clipseg.py +4 -1
  133. transformers/models/clipseg/modeling_clipseg.py +27 -25
  134. transformers/models/clipseg/processing_clipseg.py +11 -3
  135. transformers/models/clvp/configuration_clvp.py +14 -2
  136. transformers/models/clvp/modeling_clvp.py +19 -30
  137. transformers/models/codegen/configuration_codegen.py +4 -3
  138. transformers/models/codegen/modeling_codegen.py +2 -1
  139. transformers/models/cohere/configuration_cohere.py +5 -7
  140. transformers/models/cohere/modeling_cohere.py +4 -4
  141. transformers/models/cohere/modular_cohere.py +3 -3
  142. transformers/models/cohere2/configuration_cohere2.py +6 -8
  143. transformers/models/cohere2/modeling_cohere2.py +4 -4
  144. transformers/models/cohere2/modular_cohere2.py +9 -11
  145. transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
  146. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
  147. transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
  148. transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
  149. transformers/models/colqwen2/modeling_colqwen2.py +7 -6
  150. transformers/models/colqwen2/modular_colqwen2.py +7 -6
  151. transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
  152. transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
  153. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
  154. transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
  155. transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
  156. transformers/models/convbert/configuration_convbert.py +11 -7
  157. transformers/models/convnext/configuration_convnext.py +2 -4
  158. transformers/models/convnext/image_processing_convnext_fast.py +2 -2
  159. transformers/models/convnext/modeling_convnext.py +7 -6
  160. transformers/models/convnextv2/configuration_convnextv2.py +2 -4
  161. transformers/models/convnextv2/modeling_convnextv2.py +7 -6
  162. transformers/models/cpmant/configuration_cpmant.py +4 -0
  163. transformers/models/csm/configuration_csm.py +9 -15
  164. transformers/models/csm/modeling_csm.py +3 -3
  165. transformers/models/ctrl/configuration_ctrl.py +16 -0
  166. transformers/models/ctrl/modeling_ctrl.py +13 -25
  167. transformers/models/cwm/configuration_cwm.py +5 -7
  168. transformers/models/cwm/modeling_cwm.py +4 -4
  169. transformers/models/d_fine/configuration_d_fine.py +10 -56
  170. transformers/models/d_fine/modeling_d_fine.py +728 -868
  171. transformers/models/d_fine/modular_d_fine.py +335 -412
  172. transformers/models/dab_detr/configuration_dab_detr.py +22 -48
  173. transformers/models/dab_detr/modeling_dab_detr.py +11 -7
  174. transformers/models/dac/modeling_dac.py +1 -1
  175. transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
  176. transformers/models/data2vec/configuration_data2vec_text.py +11 -2
  177. transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
  178. transformers/models/data2vec/modeling_data2vec_text.py +6 -6
  179. transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
  180. transformers/models/dbrx/configuration_dbrx.py +11 -3
  181. transformers/models/dbrx/modeling_dbrx.py +6 -6
  182. transformers/models/dbrx/modular_dbrx.py +6 -6
  183. transformers/models/deberta/configuration_deberta.py +6 -0
  184. transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
  185. transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
  186. transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
  187. transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
  188. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
  189. transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
  190. transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
  191. transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
  192. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
  193. transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
  194. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
  195. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
  196. transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
  197. transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
  198. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
  199. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
  200. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
  201. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
  202. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
  203. transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
  204. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
  205. transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
  206. transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
  207. transformers/models/deit/modeling_deit.py +11 -7
  208. transformers/models/depth_anything/configuration_depth_anything.py +12 -42
  209. transformers/models/depth_anything/modeling_depth_anything.py +5 -3
  210. transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
  211. transformers/models/depth_pro/modeling_depth_pro.py +8 -4
  212. transformers/models/detr/configuration_detr.py +18 -49
  213. transformers/models/detr/image_processing_detr_fast.py +11 -11
  214. transformers/models/detr/modeling_detr.py +695 -734
  215. transformers/models/dia/configuration_dia.py +4 -7
  216. transformers/models/dia/generation_dia.py +8 -17
  217. transformers/models/dia/modeling_dia.py +7 -7
  218. transformers/models/dia/modular_dia.py +4 -4
  219. transformers/models/diffllama/configuration_diffllama.py +5 -7
  220. transformers/models/diffllama/modeling_diffllama.py +3 -8
  221. transformers/models/diffllama/modular_diffllama.py +2 -7
  222. transformers/models/dinat/configuration_dinat.py +2 -4
  223. transformers/models/dinat/modeling_dinat.py +7 -6
  224. transformers/models/dinov2/configuration_dinov2.py +2 -4
  225. transformers/models/dinov2/modeling_dinov2.py +9 -8
  226. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
  227. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
  228. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
  229. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
  230. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
  231. transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
  232. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
  233. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
  234. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
  235. transformers/models/distilbert/configuration_distilbert.py +8 -1
  236. transformers/models/distilbert/modeling_distilbert.py +3 -3
  237. transformers/models/doge/configuration_doge.py +17 -7
  238. transformers/models/doge/modeling_doge.py +4 -4
  239. transformers/models/doge/modular_doge.py +20 -10
  240. transformers/models/donut/image_processing_donut_fast.py +4 -4
  241. transformers/models/dots1/configuration_dots1.py +16 -7
  242. transformers/models/dots1/modeling_dots1.py +4 -4
  243. transformers/models/dpr/configuration_dpr.py +19 -1
  244. transformers/models/dpt/configuration_dpt.py +23 -65
  245. transformers/models/dpt/image_processing_dpt_fast.py +5 -5
  246. transformers/models/dpt/modeling_dpt.py +19 -15
  247. transformers/models/dpt/modular_dpt.py +4 -4
  248. transformers/models/edgetam/configuration_edgetam.py +1 -1
  249. transformers/models/edgetam/modeling_edgetam.py +53 -53
  250. transformers/models/edgetam/modular_edgetam.py +5 -7
  251. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
  252. transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
  253. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
  254. transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
  255. transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
  256. transformers/models/electra/configuration_electra.py +13 -2
  257. transformers/models/electra/modeling_electra.py +6 -6
  258. transformers/models/emu3/configuration_emu3.py +12 -10
  259. transformers/models/emu3/modeling_emu3.py +84 -47
  260. transformers/models/emu3/modular_emu3.py +77 -39
  261. transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
  262. transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
  263. transformers/models/eomt/configuration_eomt.py +12 -13
  264. transformers/models/eomt/image_processing_eomt_fast.py +3 -3
  265. transformers/models/eomt/modeling_eomt.py +3 -3
  266. transformers/models/eomt/modular_eomt.py +17 -17
  267. transformers/models/eomt_dinov3/__init__.py +28 -0
  268. transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
  269. transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
  270. transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
  271. transformers/models/ernie/configuration_ernie.py +24 -2
  272. transformers/models/ernie/modeling_ernie.py +6 -30
  273. transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
  274. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  275. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
  276. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
  277. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
  278. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
  279. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
  280. transformers/models/esm/configuration_esm.py +9 -11
  281. transformers/models/esm/modeling_esm.py +3 -3
  282. transformers/models/esm/modeling_esmfold.py +1 -6
  283. transformers/models/esm/openfold_utils/protein.py +2 -3
  284. transformers/models/evolla/configuration_evolla.py +21 -8
  285. transformers/models/evolla/modeling_evolla.py +11 -7
  286. transformers/models/evolla/modular_evolla.py +5 -1
  287. transformers/models/exaone4/configuration_exaone4.py +8 -5
  288. transformers/models/exaone4/modeling_exaone4.py +4 -4
  289. transformers/models/exaone4/modular_exaone4.py +11 -8
  290. transformers/models/exaone_moe/__init__.py +27 -0
  291. transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
  292. transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
  293. transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
  294. transformers/models/falcon/configuration_falcon.py +9 -1
  295. transformers/models/falcon/modeling_falcon.py +3 -8
  296. transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
  297. transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
  298. transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
  299. transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
  300. transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
  301. transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
  302. transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
  303. transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
  304. transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
  305. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
  306. transformers/models/flaubert/configuration_flaubert.py +10 -4
  307. transformers/models/flaubert/modeling_flaubert.py +1 -1
  308. transformers/models/flava/configuration_flava.py +4 -3
  309. transformers/models/flava/image_processing_flava_fast.py +4 -4
  310. transformers/models/flava/modeling_flava.py +36 -28
  311. transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
  312. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
  313. transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
  314. transformers/models/florence2/configuration_florence2.py +4 -0
  315. transformers/models/florence2/modeling_florence2.py +57 -32
  316. transformers/models/florence2/modular_florence2.py +48 -26
  317. transformers/models/fnet/configuration_fnet.py +6 -1
  318. transformers/models/focalnet/configuration_focalnet.py +2 -4
  319. transformers/models/focalnet/modeling_focalnet.py +10 -7
  320. transformers/models/fsmt/configuration_fsmt.py +12 -16
  321. transformers/models/funnel/configuration_funnel.py +8 -0
  322. transformers/models/fuyu/configuration_fuyu.py +5 -8
  323. transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
  324. transformers/models/fuyu/modeling_fuyu.py +24 -23
  325. transformers/models/gemma/configuration_gemma.py +5 -7
  326. transformers/models/gemma/modeling_gemma.py +4 -4
  327. transformers/models/gemma/modular_gemma.py +5 -7
  328. transformers/models/gemma2/configuration_gemma2.py +5 -7
  329. transformers/models/gemma2/modeling_gemma2.py +4 -4
  330. transformers/models/gemma2/modular_gemma2.py +8 -10
  331. transformers/models/gemma3/configuration_gemma3.py +28 -22
  332. transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
  333. transformers/models/gemma3/modeling_gemma3.py +37 -33
  334. transformers/models/gemma3/modular_gemma3.py +46 -42
  335. transformers/models/gemma3n/configuration_gemma3n.py +35 -22
  336. transformers/models/gemma3n/modeling_gemma3n.py +86 -58
  337. transformers/models/gemma3n/modular_gemma3n.py +112 -75
  338. transformers/models/git/configuration_git.py +5 -7
  339. transformers/models/git/modeling_git.py +31 -41
  340. transformers/models/glm/configuration_glm.py +7 -9
  341. transformers/models/glm/modeling_glm.py +4 -4
  342. transformers/models/glm4/configuration_glm4.py +7 -9
  343. transformers/models/glm4/modeling_glm4.py +4 -4
  344. transformers/models/glm46v/configuration_glm46v.py +4 -0
  345. transformers/models/glm46v/image_processing_glm46v.py +5 -2
  346. transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
  347. transformers/models/glm46v/modeling_glm46v.py +91 -46
  348. transformers/models/glm46v/modular_glm46v.py +4 -0
  349. transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
  350. transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
  351. transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
  352. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
  353. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
  354. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
  355. transformers/models/glm4v/configuration_glm4v.py +12 -8
  356. transformers/models/glm4v/image_processing_glm4v.py +5 -2
  357. transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
  358. transformers/models/glm4v/modeling_glm4v.py +120 -63
  359. transformers/models/glm4v/modular_glm4v.py +82 -50
  360. transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
  361. transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
  362. transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
  363. transformers/models/glm_image/configuration_glm_image.py +26 -20
  364. transformers/models/glm_image/image_processing_glm_image.py +1 -1
  365. transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
  366. transformers/models/glm_image/modeling_glm_image.py +337 -236
  367. transformers/models/glm_image/modular_glm_image.py +415 -255
  368. transformers/models/glm_image/processing_glm_image.py +65 -17
  369. transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
  370. transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
  371. transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
  372. transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
  373. transformers/models/glmasr/modeling_glmasr.py +34 -28
  374. transformers/models/glmasr/modular_glmasr.py +23 -11
  375. transformers/models/glpn/image_processing_glpn_fast.py +3 -3
  376. transformers/models/glpn/modeling_glpn.py +4 -2
  377. transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
  378. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
  379. transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
  380. transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
  381. transformers/models/gpt2/configuration_gpt2.py +13 -1
  382. transformers/models/gpt2/modeling_gpt2.py +5 -5
  383. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
  384. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
  385. transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
  386. transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
  387. transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
  388. transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
  389. transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
  390. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
  391. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
  392. transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
  393. transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
  394. transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
  395. transformers/models/gptj/configuration_gptj.py +4 -4
  396. transformers/models/gptj/modeling_gptj.py +3 -7
  397. transformers/models/granite/configuration_granite.py +5 -7
  398. transformers/models/granite/modeling_granite.py +4 -4
  399. transformers/models/granite_speech/modeling_granite_speech.py +63 -37
  400. transformers/models/granitemoe/configuration_granitemoe.py +5 -7
  401. transformers/models/granitemoe/modeling_granitemoe.py +4 -4
  402. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
  403. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
  404. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
  405. transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
  406. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
  407. transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
  408. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
  409. transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
  410. transformers/models/groupvit/configuration_groupvit.py +4 -1
  411. transformers/models/groupvit/modeling_groupvit.py +29 -22
  412. transformers/models/helium/configuration_helium.py +5 -7
  413. transformers/models/helium/modeling_helium.py +4 -4
  414. transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
  415. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
  416. transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
  417. transformers/models/hiera/configuration_hiera.py +2 -4
  418. transformers/models/hiera/modeling_hiera.py +11 -8
  419. transformers/models/hubert/configuration_hubert.py +4 -1
  420. transformers/models/hubert/modeling_hubert.py +7 -4
  421. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
  422. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
  423. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
  424. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
  425. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
  426. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
  427. transformers/models/ibert/configuration_ibert.py +4 -1
  428. transformers/models/idefics/configuration_idefics.py +5 -7
  429. transformers/models/idefics/modeling_idefics.py +3 -4
  430. transformers/models/idefics/vision.py +5 -4
  431. transformers/models/idefics2/configuration_idefics2.py +1 -2
  432. transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
  433. transformers/models/idefics2/modeling_idefics2.py +72 -50
  434. transformers/models/idefics3/configuration_idefics3.py +1 -3
  435. transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
  436. transformers/models/idefics3/modeling_idefics3.py +63 -40
  437. transformers/models/ijepa/modeling_ijepa.py +3 -3
  438. transformers/models/imagegpt/configuration_imagegpt.py +9 -1
  439. transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
  440. transformers/models/imagegpt/modeling_imagegpt.py +8 -4
  441. transformers/models/informer/modeling_informer.py +3 -3
  442. transformers/models/instructblip/configuration_instructblip.py +2 -1
  443. transformers/models/instructblip/modeling_instructblip.py +65 -39
  444. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
  445. transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
  446. transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
  447. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
  448. transformers/models/internvl/configuration_internvl.py +5 -0
  449. transformers/models/internvl/modeling_internvl.py +35 -55
  450. transformers/models/internvl/modular_internvl.py +26 -38
  451. transformers/models/internvl/video_processing_internvl.py +2 -2
  452. transformers/models/jais2/configuration_jais2.py +5 -7
  453. transformers/models/jais2/modeling_jais2.py +4 -4
  454. transformers/models/jamba/configuration_jamba.py +5 -7
  455. transformers/models/jamba/modeling_jamba.py +4 -4
  456. transformers/models/jamba/modular_jamba.py +3 -3
  457. transformers/models/janus/image_processing_janus.py +2 -2
  458. transformers/models/janus/image_processing_janus_fast.py +8 -8
  459. transformers/models/janus/modeling_janus.py +63 -146
  460. transformers/models/janus/modular_janus.py +62 -20
  461. transformers/models/jetmoe/configuration_jetmoe.py +6 -4
  462. transformers/models/jetmoe/modeling_jetmoe.py +3 -3
  463. transformers/models/jetmoe/modular_jetmoe.py +3 -3
  464. transformers/models/kosmos2/configuration_kosmos2.py +10 -8
  465. transformers/models/kosmos2/modeling_kosmos2.py +56 -34
  466. transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
  467. transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
  468. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
  469. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
  470. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
  471. transformers/models/lasr/configuration_lasr.py +2 -4
  472. transformers/models/lasr/modeling_lasr.py +3 -3
  473. transformers/models/lasr/modular_lasr.py +3 -3
  474. transformers/models/layoutlm/configuration_layoutlm.py +14 -1
  475. transformers/models/layoutlm/modeling_layoutlm.py +3 -3
  476. transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
  477. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
  478. transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
  479. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
  480. transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
  481. transformers/models/led/configuration_led.py +7 -8
  482. transformers/models/levit/image_processing_levit_fast.py +4 -4
  483. transformers/models/lfm2/configuration_lfm2.py +5 -7
  484. transformers/models/lfm2/modeling_lfm2.py +4 -4
  485. transformers/models/lfm2/modular_lfm2.py +3 -3
  486. transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
  487. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
  488. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  489. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
  490. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
  491. transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
  492. transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
  493. transformers/models/lightglue/modeling_lightglue.py +3 -3
  494. transformers/models/lightglue/modular_lightglue.py +3 -3
  495. transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
  496. transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
  497. transformers/models/lilt/configuration_lilt.py +6 -1
  498. transformers/models/llama/configuration_llama.py +5 -7
  499. transformers/models/llama/modeling_llama.py +4 -4
  500. transformers/models/llama4/configuration_llama4.py +67 -47
  501. transformers/models/llama4/image_processing_llama4_fast.py +3 -3
  502. transformers/models/llama4/modeling_llama4.py +46 -44
  503. transformers/models/llava/configuration_llava.py +10 -0
  504. transformers/models/llava/image_processing_llava_fast.py +3 -3
  505. transformers/models/llava/modeling_llava.py +38 -65
  506. transformers/models/llava_next/configuration_llava_next.py +2 -1
  507. transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
  508. transformers/models/llava_next/modeling_llava_next.py +61 -60
  509. transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
  510. transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
  511. transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
  512. transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
  513. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
  514. transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
  515. transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
  516. transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
  517. transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
  518. transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
  519. transformers/models/longformer/configuration_longformer.py +4 -1
  520. transformers/models/longt5/configuration_longt5.py +9 -6
  521. transformers/models/longt5/modeling_longt5.py +2 -1
  522. transformers/models/luke/configuration_luke.py +8 -1
  523. transformers/models/lw_detr/configuration_lw_detr.py +19 -31
  524. transformers/models/lw_detr/modeling_lw_detr.py +43 -44
  525. transformers/models/lw_detr/modular_lw_detr.py +36 -38
  526. transformers/models/lxmert/configuration_lxmert.py +16 -0
  527. transformers/models/m2m_100/configuration_m2m_100.py +7 -8
  528. transformers/models/m2m_100/modeling_m2m_100.py +3 -3
  529. transformers/models/mamba/configuration_mamba.py +5 -2
  530. transformers/models/mamba/modeling_mamba.py +18 -26
  531. transformers/models/mamba2/configuration_mamba2.py +5 -7
  532. transformers/models/mamba2/modeling_mamba2.py +22 -33
  533. transformers/models/marian/configuration_marian.py +10 -4
  534. transformers/models/marian/modeling_marian.py +4 -4
  535. transformers/models/markuplm/configuration_markuplm.py +4 -6
  536. transformers/models/markuplm/modeling_markuplm.py +3 -3
  537. transformers/models/mask2former/configuration_mask2former.py +12 -47
  538. transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
  539. transformers/models/mask2former/modeling_mask2former.py +18 -12
  540. transformers/models/maskformer/configuration_maskformer.py +14 -45
  541. transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
  542. transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
  543. transformers/models/maskformer/modeling_maskformer.py +15 -9
  544. transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
  545. transformers/models/mbart/configuration_mbart.py +9 -4
  546. transformers/models/mbart/modeling_mbart.py +9 -6
  547. transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
  548. transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
  549. transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
  550. transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
  551. transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
  552. transformers/models/mgp_str/modeling_mgp_str.py +4 -2
  553. transformers/models/mimi/configuration_mimi.py +4 -0
  554. transformers/models/mimi/modeling_mimi.py +40 -36
  555. transformers/models/minimax/configuration_minimax.py +8 -11
  556. transformers/models/minimax/modeling_minimax.py +5 -5
  557. transformers/models/minimax/modular_minimax.py +9 -12
  558. transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
  559. transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
  560. transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
  561. transformers/models/ministral/configuration_ministral.py +5 -7
  562. transformers/models/ministral/modeling_ministral.py +4 -4
  563. transformers/models/ministral/modular_ministral.py +5 -8
  564. transformers/models/ministral3/configuration_ministral3.py +4 -4
  565. transformers/models/ministral3/modeling_ministral3.py +4 -4
  566. transformers/models/ministral3/modular_ministral3.py +3 -3
  567. transformers/models/mistral/configuration_mistral.py +5 -7
  568. transformers/models/mistral/modeling_mistral.py +4 -4
  569. transformers/models/mistral/modular_mistral.py +3 -3
  570. transformers/models/mistral3/configuration_mistral3.py +4 -0
  571. transformers/models/mistral3/modeling_mistral3.py +36 -40
  572. transformers/models/mistral3/modular_mistral3.py +31 -32
  573. transformers/models/mixtral/configuration_mixtral.py +8 -11
  574. transformers/models/mixtral/modeling_mixtral.py +4 -4
  575. transformers/models/mlcd/modeling_mlcd.py +7 -5
  576. transformers/models/mlcd/modular_mlcd.py +7 -5
  577. transformers/models/mllama/configuration_mllama.py +5 -7
  578. transformers/models/mllama/image_processing_mllama_fast.py +6 -5
  579. transformers/models/mllama/modeling_mllama.py +19 -19
  580. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
  581. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
  582. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
  583. transformers/models/mobilebert/configuration_mobilebert.py +4 -1
  584. transformers/models/mobilebert/modeling_mobilebert.py +3 -3
  585. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
  586. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
  587. transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
  588. transformers/models/mobilevit/modeling_mobilevit.py +4 -2
  589. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
  590. transformers/models/modernbert/configuration_modernbert.py +46 -21
  591. transformers/models/modernbert/modeling_modernbert.py +146 -899
  592. transformers/models/modernbert/modular_modernbert.py +185 -908
  593. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
  594. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
  595. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
  596. transformers/models/moonshine/configuration_moonshine.py +12 -7
  597. transformers/models/moonshine/modeling_moonshine.py +7 -7
  598. transformers/models/moonshine/modular_moonshine.py +19 -13
  599. transformers/models/moshi/configuration_moshi.py +28 -2
  600. transformers/models/moshi/modeling_moshi.py +4 -9
  601. transformers/models/mpnet/configuration_mpnet.py +6 -1
  602. transformers/models/mpt/configuration_mpt.py +16 -0
  603. transformers/models/mra/configuration_mra.py +8 -1
  604. transformers/models/mt5/configuration_mt5.py +9 -5
  605. transformers/models/mt5/modeling_mt5.py +5 -8
  606. transformers/models/musicgen/configuration_musicgen.py +12 -7
  607. transformers/models/musicgen/modeling_musicgen.py +6 -5
  608. transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
  609. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
  610. transformers/models/mvp/configuration_mvp.py +8 -4
  611. transformers/models/mvp/modeling_mvp.py +6 -4
  612. transformers/models/nanochat/configuration_nanochat.py +5 -7
  613. transformers/models/nanochat/modeling_nanochat.py +4 -4
  614. transformers/models/nanochat/modular_nanochat.py +4 -4
  615. transformers/models/nemotron/configuration_nemotron.py +5 -7
  616. transformers/models/nemotron/modeling_nemotron.py +4 -14
  617. transformers/models/nllb/tokenization_nllb.py +7 -5
  618. transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
  619. transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
  620. transformers/models/nougat/image_processing_nougat_fast.py +8 -8
  621. transformers/models/nystromformer/configuration_nystromformer.py +8 -1
  622. transformers/models/olmo/configuration_olmo.py +5 -7
  623. transformers/models/olmo/modeling_olmo.py +4 -4
  624. transformers/models/olmo/modular_olmo.py +3 -3
  625. transformers/models/olmo2/configuration_olmo2.py +9 -11
  626. transformers/models/olmo2/modeling_olmo2.py +4 -4
  627. transformers/models/olmo2/modular_olmo2.py +7 -7
  628. transformers/models/olmo3/configuration_olmo3.py +10 -11
  629. transformers/models/olmo3/modeling_olmo3.py +4 -4
  630. transformers/models/olmo3/modular_olmo3.py +13 -14
  631. transformers/models/olmoe/configuration_olmoe.py +5 -7
  632. transformers/models/olmoe/modeling_olmoe.py +4 -4
  633. transformers/models/olmoe/modular_olmoe.py +3 -3
  634. transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
  635. transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
  636. transformers/models/oneformer/configuration_oneformer.py +9 -46
  637. transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
  638. transformers/models/oneformer/modeling_oneformer.py +14 -9
  639. transformers/models/openai/configuration_openai.py +16 -0
  640. transformers/models/opt/configuration_opt.py +6 -6
  641. transformers/models/opt/modeling_opt.py +5 -5
  642. transformers/models/ovis2/configuration_ovis2.py +4 -0
  643. transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
  644. transformers/models/ovis2/modeling_ovis2.py +58 -99
  645. transformers/models/ovis2/modular_ovis2.py +52 -13
  646. transformers/models/owlv2/configuration_owlv2.py +4 -1
  647. transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
  648. transformers/models/owlv2/modeling_owlv2.py +40 -27
  649. transformers/models/owlv2/modular_owlv2.py +5 -5
  650. transformers/models/owlvit/configuration_owlvit.py +4 -1
  651. transformers/models/owlvit/modeling_owlvit.py +40 -27
  652. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
  653. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
  654. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
  655. transformers/models/paligemma/configuration_paligemma.py +4 -0
  656. transformers/models/paligemma/modeling_paligemma.py +30 -26
  657. transformers/models/parakeet/configuration_parakeet.py +2 -4
  658. transformers/models/parakeet/modeling_parakeet.py +3 -3
  659. transformers/models/parakeet/modular_parakeet.py +3 -3
  660. transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
  661. transformers/models/patchtst/modeling_patchtst.py +3 -3
  662. transformers/models/pe_audio/modeling_pe_audio.py +4 -4
  663. transformers/models/pe_audio/modular_pe_audio.py +1 -1
  664. transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
  665. transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
  666. transformers/models/pe_video/modeling_pe_video.py +36 -24
  667. transformers/models/pe_video/modular_pe_video.py +36 -23
  668. transformers/models/pegasus/configuration_pegasus.py +8 -5
  669. transformers/models/pegasus/modeling_pegasus.py +4 -4
  670. transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
  671. transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
  672. transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
  673. transformers/models/perceiver/modeling_perceiver.py +17 -9
  674. transformers/models/perception_lm/modeling_perception_lm.py +26 -27
  675. transformers/models/perception_lm/modular_perception_lm.py +27 -25
  676. transformers/models/persimmon/configuration_persimmon.py +5 -7
  677. transformers/models/persimmon/modeling_persimmon.py +5 -5
  678. transformers/models/phi/configuration_phi.py +8 -6
  679. transformers/models/phi/modeling_phi.py +4 -4
  680. transformers/models/phi/modular_phi.py +3 -3
  681. transformers/models/phi3/configuration_phi3.py +9 -11
  682. transformers/models/phi3/modeling_phi3.py +4 -4
  683. transformers/models/phi3/modular_phi3.py +3 -3
  684. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
  685. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
  686. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
  687. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
  688. transformers/models/phimoe/configuration_phimoe.py +5 -7
  689. transformers/models/phimoe/modeling_phimoe.py +15 -39
  690. transformers/models/phimoe/modular_phimoe.py +12 -7
  691. transformers/models/pix2struct/configuration_pix2struct.py +12 -9
  692. transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
  693. transformers/models/pix2struct/modeling_pix2struct.py +14 -7
  694. transformers/models/pixio/configuration_pixio.py +2 -4
  695. transformers/models/pixio/modeling_pixio.py +9 -8
  696. transformers/models/pixio/modular_pixio.py +4 -2
  697. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
  698. transformers/models/pixtral/modeling_pixtral.py +9 -12
  699. transformers/models/plbart/configuration_plbart.py +8 -5
  700. transformers/models/plbart/modeling_plbart.py +9 -7
  701. transformers/models/plbart/modular_plbart.py +1 -1
  702. transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
  703. transformers/models/pop2piano/configuration_pop2piano.py +7 -6
  704. transformers/models/pop2piano/modeling_pop2piano.py +2 -1
  705. transformers/models/pp_doclayout_v3/__init__.py +30 -0
  706. transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
  707. transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
  708. transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
  709. transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
  710. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
  711. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
  712. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
  713. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
  714. transformers/models/prophetnet/configuration_prophetnet.py +11 -10
  715. transformers/models/prophetnet/modeling_prophetnet.py +12 -23
  716. transformers/models/pvt/image_processing_pvt.py +7 -7
  717. transformers/models/pvt/image_processing_pvt_fast.py +1 -1
  718. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  719. transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
  720. transformers/models/qwen2/configuration_qwen2.py +14 -4
  721. transformers/models/qwen2/modeling_qwen2.py +4 -4
  722. transformers/models/qwen2/modular_qwen2.py +3 -3
  723. transformers/models/qwen2/tokenization_qwen2.py +0 -4
  724. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
  725. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
  726. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
  727. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
  728. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
  729. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
  730. transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
  731. transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
  732. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  733. transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
  734. transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
  735. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
  736. transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
  737. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
  738. transformers/models/qwen3/configuration_qwen3.py +15 -5
  739. transformers/models/qwen3/modeling_qwen3.py +4 -4
  740. transformers/models/qwen3/modular_qwen3.py +3 -3
  741. transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
  742. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  743. transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
  744. transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
  745. transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
  746. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
  747. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
  748. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
  749. transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
  750. transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
  751. transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
  752. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
  753. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
  754. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
  755. transformers/models/rag/configuration_rag.py +6 -6
  756. transformers/models/rag/modeling_rag.py +3 -3
  757. transformers/models/rag/retrieval_rag.py +1 -1
  758. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
  759. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
  760. transformers/models/reformer/configuration_reformer.py +7 -7
  761. transformers/models/rembert/configuration_rembert.py +8 -1
  762. transformers/models/rembert/modeling_rembert.py +0 -22
  763. transformers/models/resnet/configuration_resnet.py +2 -4
  764. transformers/models/resnet/modeling_resnet.py +6 -5
  765. transformers/models/roberta/configuration_roberta.py +11 -2
  766. transformers/models/roberta/modeling_roberta.py +6 -6
  767. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
  768. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
  769. transformers/models/roc_bert/configuration_roc_bert.py +8 -1
  770. transformers/models/roc_bert/modeling_roc_bert.py +6 -41
  771. transformers/models/roformer/configuration_roformer.py +13 -2
  772. transformers/models/roformer/modeling_roformer.py +0 -14
  773. transformers/models/rt_detr/configuration_rt_detr.py +8 -49
  774. transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
  775. transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
  776. transformers/models/rt_detr/modeling_rt_detr.py +578 -737
  777. transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
  778. transformers/models/rt_detr/modular_rt_detr.py +1508 -6
  779. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
  780. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
  781. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
  782. transformers/models/rwkv/configuration_rwkv.py +2 -3
  783. transformers/models/rwkv/modeling_rwkv.py +0 -23
  784. transformers/models/sam/configuration_sam.py +2 -0
  785. transformers/models/sam/image_processing_sam_fast.py +4 -4
  786. transformers/models/sam/modeling_sam.py +13 -8
  787. transformers/models/sam/processing_sam.py +3 -3
  788. transformers/models/sam2/configuration_sam2.py +1 -1
  789. transformers/models/sam2/modeling_sam2.py +56 -52
  790. transformers/models/sam2/modular_sam2.py +47 -55
  791. transformers/models/sam2_video/modeling_sam2_video.py +50 -51
  792. transformers/models/sam2_video/modular_sam2_video.py +12 -10
  793. transformers/models/sam3/modeling_sam3.py +43 -47
  794. transformers/models/sam3/processing_sam3.py +8 -4
  795. transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
  796. transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
  797. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
  798. transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
  799. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
  800. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
  801. transformers/models/sam3_video/modeling_sam3_video.py +27 -14
  802. transformers/models/sam_hq/configuration_sam_hq.py +2 -0
  803. transformers/models/sam_hq/modeling_sam_hq.py +13 -9
  804. transformers/models/sam_hq/modular_sam_hq.py +6 -6
  805. transformers/models/sam_hq/processing_sam_hq.py +7 -6
  806. transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
  807. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
  808. transformers/models/seed_oss/configuration_seed_oss.py +7 -9
  809. transformers/models/seed_oss/modeling_seed_oss.py +4 -4
  810. transformers/models/seed_oss/modular_seed_oss.py +3 -3
  811. transformers/models/segformer/image_processing_segformer_fast.py +4 -4
  812. transformers/models/segformer/modeling_segformer.py +4 -2
  813. transformers/models/segformer/modular_segformer.py +3 -3
  814. transformers/models/seggpt/modeling_seggpt.py +20 -8
  815. transformers/models/sew/configuration_sew.py +4 -1
  816. transformers/models/sew/modeling_sew.py +9 -5
  817. transformers/models/sew/modular_sew.py +2 -1
  818. transformers/models/sew_d/configuration_sew_d.py +4 -1
  819. transformers/models/sew_d/modeling_sew_d.py +4 -1
  820. transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
  821. transformers/models/siglip/configuration_siglip.py +4 -1
  822. transformers/models/siglip/modeling_siglip.py +27 -71
  823. transformers/models/siglip2/__init__.py +1 -0
  824. transformers/models/siglip2/configuration_siglip2.py +4 -2
  825. transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
  826. transformers/models/siglip2/modeling_siglip2.py +37 -78
  827. transformers/models/siglip2/modular_siglip2.py +74 -25
  828. transformers/models/siglip2/tokenization_siglip2.py +95 -0
  829. transformers/models/smollm3/configuration_smollm3.py +6 -6
  830. transformers/models/smollm3/modeling_smollm3.py +4 -4
  831. transformers/models/smollm3/modular_smollm3.py +9 -9
  832. transformers/models/smolvlm/configuration_smolvlm.py +1 -3
  833. transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
  834. transformers/models/smolvlm/modeling_smolvlm.py +75 -46
  835. transformers/models/smolvlm/modular_smolvlm.py +36 -23
  836. transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
  837. transformers/models/solar_open/__init__.py +27 -0
  838. transformers/models/solar_open/configuration_solar_open.py +184 -0
  839. transformers/models/solar_open/modeling_solar_open.py +642 -0
  840. transformers/models/solar_open/modular_solar_open.py +224 -0
  841. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
  842. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
  843. transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
  844. transformers/models/speecht5/configuration_speecht5.py +7 -8
  845. transformers/models/splinter/configuration_splinter.py +6 -6
  846. transformers/models/splinter/modeling_splinter.py +8 -3
  847. transformers/models/squeezebert/configuration_squeezebert.py +14 -1
  848. transformers/models/stablelm/configuration_stablelm.py +8 -6
  849. transformers/models/stablelm/modeling_stablelm.py +5 -5
  850. transformers/models/starcoder2/configuration_starcoder2.py +11 -5
  851. transformers/models/starcoder2/modeling_starcoder2.py +5 -5
  852. transformers/models/starcoder2/modular_starcoder2.py +4 -4
  853. transformers/models/superglue/configuration_superglue.py +4 -0
  854. transformers/models/superglue/image_processing_superglue_fast.py +4 -3
  855. transformers/models/superglue/modeling_superglue.py +9 -4
  856. transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
  857. transformers/models/superpoint/modeling_superpoint.py +4 -2
  858. transformers/models/swin/configuration_swin.py +2 -4
  859. transformers/models/swin/modeling_swin.py +11 -8
  860. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
  861. transformers/models/swin2sr/modeling_swin2sr.py +4 -2
  862. transformers/models/swinv2/configuration_swinv2.py +2 -4
  863. transformers/models/swinv2/modeling_swinv2.py +10 -7
  864. transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
  865. transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
  866. transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
  867. transformers/models/t5/configuration_t5.py +9 -8
  868. transformers/models/t5/modeling_t5.py +5 -8
  869. transformers/models/t5gemma/configuration_t5gemma.py +10 -25
  870. transformers/models/t5gemma/modeling_t5gemma.py +9 -9
  871. transformers/models/t5gemma/modular_t5gemma.py +11 -24
  872. transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
  873. transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
  874. transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
  875. transformers/models/table_transformer/configuration_table_transformer.py +18 -49
  876. transformers/models/table_transformer/modeling_table_transformer.py +27 -53
  877. transformers/models/tapas/configuration_tapas.py +12 -1
  878. transformers/models/tapas/modeling_tapas.py +1 -1
  879. transformers/models/tapas/tokenization_tapas.py +1 -0
  880. transformers/models/textnet/configuration_textnet.py +4 -6
  881. transformers/models/textnet/image_processing_textnet_fast.py +3 -3
  882. transformers/models/textnet/modeling_textnet.py +15 -14
  883. transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
  884. transformers/models/timesfm/modeling_timesfm.py +5 -6
  885. transformers/models/timesfm/modular_timesfm.py +5 -6
  886. transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
  887. transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
  888. transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
  889. transformers/models/trocr/configuration_trocr.py +11 -7
  890. transformers/models/trocr/modeling_trocr.py +4 -2
  891. transformers/models/tvp/configuration_tvp.py +10 -35
  892. transformers/models/tvp/image_processing_tvp_fast.py +6 -5
  893. transformers/models/tvp/modeling_tvp.py +1 -1
  894. transformers/models/udop/configuration_udop.py +16 -7
  895. transformers/models/udop/modeling_udop.py +10 -6
  896. transformers/models/umt5/configuration_umt5.py +8 -6
  897. transformers/models/umt5/modeling_umt5.py +7 -3
  898. transformers/models/unispeech/configuration_unispeech.py +4 -1
  899. transformers/models/unispeech/modeling_unispeech.py +7 -4
  900. transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
  901. transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
  902. transformers/models/upernet/configuration_upernet.py +8 -35
  903. transformers/models/upernet/modeling_upernet.py +1 -1
  904. transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
  905. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  906. transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
  907. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
  908. transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
  909. transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
  910. transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
  911. transformers/models/video_llava/configuration_video_llava.py +4 -0
  912. transformers/models/video_llava/modeling_video_llava.py +87 -89
  913. transformers/models/videomae/modeling_videomae.py +4 -5
  914. transformers/models/vilt/configuration_vilt.py +4 -1
  915. transformers/models/vilt/image_processing_vilt_fast.py +6 -6
  916. transformers/models/vilt/modeling_vilt.py +27 -12
  917. transformers/models/vipllava/configuration_vipllava.py +4 -0
  918. transformers/models/vipllava/modeling_vipllava.py +57 -31
  919. transformers/models/vipllava/modular_vipllava.py +50 -24
  920. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
  921. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
  922. transformers/models/visual_bert/configuration_visual_bert.py +6 -1
  923. transformers/models/vit/configuration_vit.py +2 -2
  924. transformers/models/vit/modeling_vit.py +7 -5
  925. transformers/models/vit_mae/modeling_vit_mae.py +11 -7
  926. transformers/models/vit_msn/modeling_vit_msn.py +11 -7
  927. transformers/models/vitdet/configuration_vitdet.py +2 -4
  928. transformers/models/vitdet/modeling_vitdet.py +2 -3
  929. transformers/models/vitmatte/configuration_vitmatte.py +6 -35
  930. transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
  931. transformers/models/vitmatte/modeling_vitmatte.py +1 -1
  932. transformers/models/vitpose/configuration_vitpose.py +6 -43
  933. transformers/models/vitpose/modeling_vitpose.py +5 -3
  934. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
  935. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
  936. transformers/models/vits/configuration_vits.py +4 -0
  937. transformers/models/vits/modeling_vits.py +9 -7
  938. transformers/models/vivit/modeling_vivit.py +4 -4
  939. transformers/models/vjepa2/modeling_vjepa2.py +9 -9
  940. transformers/models/voxtral/configuration_voxtral.py +0 -1
  941. transformers/models/voxtral/modeling_voxtral.py +25 -24
  942. transformers/models/voxtral/modular_voxtral.py +26 -20
  943. transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
  944. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
  945. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
  946. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
  947. transformers/models/wavlm/configuration_wavlm.py +4 -1
  948. transformers/models/wavlm/modeling_wavlm.py +4 -1
  949. transformers/models/whisper/configuration_whisper.py +6 -4
  950. transformers/models/whisper/generation_whisper.py +0 -1
  951. transformers/models/whisper/modeling_whisper.py +3 -3
  952. transformers/models/x_clip/configuration_x_clip.py +4 -1
  953. transformers/models/x_clip/modeling_x_clip.py +26 -27
  954. transformers/models/xglm/configuration_xglm.py +9 -7
  955. transformers/models/xlm/configuration_xlm.py +10 -7
  956. transformers/models/xlm/modeling_xlm.py +1 -1
  957. transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
  958. transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
  959. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
  960. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
  961. transformers/models/xlnet/configuration_xlnet.py +3 -1
  962. transformers/models/xlstm/configuration_xlstm.py +5 -7
  963. transformers/models/xlstm/modeling_xlstm.py +0 -32
  964. transformers/models/xmod/configuration_xmod.py +11 -2
  965. transformers/models/xmod/modeling_xmod.py +13 -16
  966. transformers/models/yolos/image_processing_yolos_fast.py +25 -28
  967. transformers/models/yolos/modeling_yolos.py +7 -7
  968. transformers/models/yolos/modular_yolos.py +16 -16
  969. transformers/models/yoso/configuration_yoso.py +8 -1
  970. transformers/models/youtu/__init__.py +27 -0
  971. transformers/models/youtu/configuration_youtu.py +194 -0
  972. transformers/models/youtu/modeling_youtu.py +619 -0
  973. transformers/models/youtu/modular_youtu.py +254 -0
  974. transformers/models/zamba/configuration_zamba.py +5 -7
  975. transformers/models/zamba/modeling_zamba.py +25 -56
  976. transformers/models/zamba2/configuration_zamba2.py +8 -13
  977. transformers/models/zamba2/modeling_zamba2.py +53 -78
  978. transformers/models/zamba2/modular_zamba2.py +36 -29
  979. transformers/models/zoedepth/configuration_zoedepth.py +17 -40
  980. transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
  981. transformers/models/zoedepth/modeling_zoedepth.py +5 -3
  982. transformers/pipelines/__init__.py +1 -61
  983. transformers/pipelines/any_to_any.py +1 -1
  984. transformers/pipelines/automatic_speech_recognition.py +0 -2
  985. transformers/pipelines/base.py +1 -1
  986. transformers/pipelines/image_text_to_text.py +1 -1
  987. transformers/pipelines/text_to_audio.py +5 -1
  988. transformers/processing_utils.py +35 -44
  989. transformers/pytorch_utils.py +2 -26
  990. transformers/quantizers/quantizer_compressed_tensors.py +7 -5
  991. transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
  992. transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
  993. transformers/quantizers/quantizer_mxfp4.py +1 -1
  994. transformers/quantizers/quantizer_torchao.py +0 -16
  995. transformers/safetensors_conversion.py +11 -4
  996. transformers/testing_utils.py +3 -28
  997. transformers/tokenization_mistral_common.py +9 -0
  998. transformers/tokenization_python.py +6 -4
  999. transformers/tokenization_utils_base.py +119 -219
  1000. transformers/tokenization_utils_tokenizers.py +31 -2
  1001. transformers/trainer.py +25 -33
  1002. transformers/trainer_seq2seq.py +1 -1
  1003. transformers/training_args.py +411 -417
  1004. transformers/utils/__init__.py +1 -4
  1005. transformers/utils/auto_docstring.py +15 -18
  1006. transformers/utils/backbone_utils.py +13 -373
  1007. transformers/utils/doc.py +4 -36
  1008. transformers/utils/generic.py +69 -33
  1009. transformers/utils/import_utils.py +72 -75
  1010. transformers/utils/loading_report.py +133 -105
  1011. transformers/utils/quantization_config.py +0 -21
  1012. transformers/video_processing_utils.py +5 -5
  1013. transformers/video_utils.py +3 -1
  1014. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
  1015. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
  1016. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
  1017. transformers/pipelines/deprecated/text2text_generation.py +0 -408
  1018. transformers/pipelines/image_to_text.py +0 -189
  1019. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
  1020. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
  1021. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -40,7 +40,7 @@ from .image_processing_dpt import DPTImageProcessorKwargs
 if TYPE_CHECKING:
     from ...modeling_outputs import DepthEstimatorOutput

-    from torchvision.transforms.v2 import functional as F
+    import torchvision.transforms.v2.functional as tvF


 def get_resize_output_image_size(
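Note on the hunk above: the fast image processors in this release replace the `from torchvision.transforms.v2 import functional as F` alias with a direct `import torchvision.transforms.v2.functional as tvF`. Both names bind the same module, so call sites behave identically; a minimal sketch (assuming torchvision is installed):

```python
# Minimal sketch: both import styles bind the same torchvision module, so the
# rename from F to tvF is purely cosmetic for call sites such as tvF.pad below.
import torch
import torchvision.transforms.v2.functional as tvF
from torchvision.transforms.v2 import functional as F

assert tvF is F  # same module object, only the local alias changed

image = torch.zeros(3, 32, 32)
padded = tvF.pad(image, [2, 2, 2, 2])  # padding: (left, top, right, bottom)
print(padded.shape)  # torch.Size([3, 36, 36])
```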
@@ -105,7 +105,7 @@ class DPTImageProcessorFast(BeitImageProcessorFast):
         self,
         image: "torch.Tensor",
         size: SizeDict,
-        interpolation: Optional["F.InterpolationMode"] = None,
+        interpolation: Optional["tvF.InterpolationMode"] = None,
         antialias: bool = True,
         ensure_multiple_of: int | None = 1,
         keep_aspect_ratio: bool = False,
@@ -169,7 +169,7 @@ class DPTImageProcessorFast(BeitImageProcessorFast):
         pad_top, pad_bottom = _get_pad(height, size_divisor)
         pad_left, pad_right = _get_pad(width, size_divisor)
         padding = (pad_left, pad_top, pad_right, pad_bottom)
-        return F.pad(image, padding)
+        return tvF.pad(image, padding)

     def _preprocess(
         self,
@@ -177,7 +177,7 @@ class DPTImageProcessorFast(BeitImageProcessorFast):
         do_reduce_labels: bool,
         do_resize: bool,
         size: SizeDict,
-        interpolation: Optional["F.InterpolationMode"],
+        interpolation: Optional["tvF.InterpolationMode"],
         do_center_crop: bool,
         crop_size: SizeDict,
         do_rescale: bool,
@@ -271,7 +271,7 @@ class EdgeTamConfig(PreTrainedConfig):
     ... )

     >>> # Initializing a EdgeTamConfig with `"facebook/edgetam.1_hiera_tiny"` style configuration
-    >>> configuration = EdgeTamconfig()
+    >>> configuration = EdgeTamConfig()

     >>> # Initializing a EdgeTamModel (with random weights) from the `"facebook/edgetam.1_hiera_tiny"` style configuration
     >>> model = EdgeTamModel(configuration)
@@ -32,12 +32,13 @@ from transformers.utils.generic import OutputRecorder

 from ... import initialization as init
 from ...activations import ACT2FN
-from ...modeling_outputs import BaseModelOutput
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...pytorch_utils import compile_compatible_method_lru_cache
-from ...utils import ModelOutput, auto_docstring
-from ...utils.generic import TransformersKwargs, check_model_inputs
+from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging
+from ...utils.generic import TransformersKwargs, check_model_inputs, is_flash_attention_requested
 from ..auto import AutoModel
 from .configuration_edgetam import (
     EdgeTamConfig,
@@ -47,9 +48,7 @@ from .configuration_edgetam import (
 )


-# fix this in modular
-if True:
-    from ..timm_wrapper.modeling_timm_wrapper import TimmWrapperModel
+logger = logging.get_logger(__name__)


 class EdgeTamLayerNorm(nn.LayerNorm):
@@ -80,16 +79,10 @@ class EdgeTamLayerNorm(nn.LayerNorm):

 @dataclass
 @auto_docstring(custom_intro="Base class for the vision encoder's outputs.")
-class EdgeTamVisionEncoderOutput(ModelOutput):
+class EdgeTamVisionEncoderOutput(BaseModelOutputWithPooling):
     r"""
     last_hidden_state (`torch.FloatTensor` of shape `(batch_size, height, width, hidden_size)`):
         Sequence of hidden-states at the output of the last layer of the model.
-    fpn_hidden_states (`tuple(torch.FloatTensor)`):
-        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
-        `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck.
-    fpn_position_encoding (`tuple(torch.FloatTensor)`):
-        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
-        `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`.
     hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
         Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
         one for the output of each stage) of shape `(batch_size, height, width, hidden_size)`. Hidden-states of the
@@ -98,13 +91,16 @@ class EdgeTamVisionEncoderOutput(ModelOutput):
         Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
         sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
         the self-attention heads.
+    fpn_hidden_states (`tuple(torch.FloatTensor)`):
+        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
+        `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck.
+    fpn_position_encoding (`tuple(torch.FloatTensor)`):
+        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
+        `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`.
     """

-    last_hidden_state: torch.FloatTensor | None = None
     fpn_hidden_states: torch.FloatTensor | None = None
     fpn_position_encoding: torch.FloatTensor | None = None
-    hidden_states: tuple[torch.FloatTensor, ...] | None = None
-    attentions: tuple[torch.FloatTensor, ...] | None = None


 def eager_attention_forward(
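The two hunks above fold `EdgeTamVisionEncoderOutput` into `BaseModelOutputWithPooling`: `last_hidden_state`, `hidden_states`, and `attentions` are now inherited, and only the FPN fields are declared locally (their docstring entries move after the inherited fields to match the new field order). A minimal sketch of the pattern; `MyVisionOutput` is a hypothetical stand-in, not a library class:

```python
# Minimal sketch of the refactor pattern: inherit the generic output fields
# and declare only the model-specific ones. `MyVisionOutput` is hypothetical.
from dataclasses import dataclass

import torch
from transformers.modeling_outputs import BaseModelOutputWithPooling


@dataclass
class MyVisionOutput(BaseModelOutputWithPooling):
    # Inherited: last_hidden_state, pooler_output, hidden_states, attentions.
    # New fields come after the inherited ones in tuple conversion.
    fpn_hidden_states: tuple[torch.FloatTensor, ...] | None = None
    fpn_position_encoding: tuple[torch.FloatTensor, ...] | None = None


out = MyVisionOutput(last_hidden_state=torch.zeros(1, 4, 8))
print(out.fpn_hidden_states)  # None until the FPN neck populates it
print(list(out.keys()))       # only fields that are set: ['last_hidden_state']
```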
@@ -167,9 +163,18 @@ class EdgeTamAttention(nn.Module):
         key = self.k_proj(key).view(*new_shape).transpose(1, 2)
         value = self.v_proj(value).view(*new_shape).transpose(1, 2)

-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+
+        if is_flash_attention_requested(self.config) and attention_similarity is not None:
+            # Target guided masks are represented as float masks and are incompatible with Flash Attention
+            # Fallback to SDPA for this call only so the rest of the model can still benefit from FA
+            attention_interface = ALL_ATTENTION_FUNCTIONS["sdpa"]
+            logger.warning_once(
+                "Falling back to SDPA for target-guided attention because "
+                "Flash Attention does not support additive bias masks."
+            )

         attn_output, attn_weights = attention_interface(
             self,
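The hunk above replaces the manual eager/registry branch with `ALL_ATTENTION_FUNCTIONS.get_interface(impl, default)` and adds a per-call SDPA fallback when Flash Attention is requested alongside a float `attention_similarity` bias. A minimal sketch of the dispatch shape, with a plain dict standing in for the registry (all names here are stand-ins, not the library objects):

```python
# Sketch of the lookup-with-fallback dispatch above; a plain dict stands in
# for ALL_ATTENTION_FUNCTIONS and the kernels are no-op placeholders.
from typing import Callable, Optional

import torch


def eager_attention_forward(*args, **kwargs):
    """Reference implementation, always available as the default."""


def sdpa_placeholder(*args, **kwargs):
    """Stand-in for the registered SDPA kernel."""


ATTENTION_FUNCTIONS: dict[str, Callable] = {
    "sdpa": sdpa_placeholder,
    "flash_attention_2": lambda *a, **k: None,
}


def resolve_attention(impl: str, attention_similarity: Optional[torch.Tensor] = None) -> Callable:
    # dict.get with a default mirrors get_interface(impl, eager_attention_forward)
    interface = ATTENTION_FUNCTIONS.get(impl, eager_attention_forward)
    if impl.startswith("flash") and attention_similarity is not None:
        # Additive float bias masks are incompatible with Flash Attention
        interface = ATTENTION_FUNCTIONS["sdpa"]
    return interface


assert resolve_attention("eager") is eager_attention_forward
assert resolve_attention("flash_attention_2", torch.zeros(1)) is sdpa_placeholder
```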
@@ -191,7 +196,7 @@ class EdgeTamAttention(nn.Module):
         return attn_output, attn_weights


-class EdgeTamTwoWayAttentionBlock(nn.Module):
+class EdgeTamTwoWayAttentionBlock(GradientCheckpointingLayer):
     def __init__(self, config: EdgeTamMaskDecoderConfig, skip_first_layer_pe: bool = False):
         """
         A transformer block with four layers:
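Rebasing the two-way attention blocks onto `GradientCheckpointingLayer` lets `gradient_checkpointing_enable()` recompute their activations during backward instead of storing them. A minimal sketch of the underlying idea using `torch.utils.checkpoint` (hypothetical `CheckpointedBlock`, not the library class):

```python
# Minimal sketch of what a gradient-checkpointed layer buys: activations of
# the wrapped forward are recomputed in backward rather than kept in memory.
import torch
from torch import nn
from torch.utils.checkpoint import checkpoint


class CheckpointedBlock(nn.Module):
    def __init__(self, dim: int = 16):
        super().__init__()
        self.ff = nn.Sequential(nn.Linear(dim, dim), nn.ReLU(), nn.Linear(dim, dim))
        self.gradient_checkpointing = False  # toggled by the framework when enabled

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.gradient_checkpointing and self.training:
            return checkpoint(self.ff, x, use_reentrant=False)
        return self.ff(x)


block = CheckpointedBlock().train()
block.gradient_checkpointing = True
out = block(torch.randn(2, 16, requires_grad=True))
out.sum().backward()  # self.ff is re-run during backward to rebuild activations
```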
@@ -305,7 +310,7 @@ class EdgeTamPreTrainedModel(PreTrainedModel):
     main_input_name = "pixel_values"
     input_modalities = ("image",)
     _supports_sdpa = True
-    _supports_flash_attn_2 = True
+    _supports_flash_attn = True
     _supports_attention_backend = True

     @torch.no_grad()
@@ -425,7 +430,9 @@ class EdgeTamVisionNeck(nn.Module):
 class EdgeTamVisionModel(EdgeTamPreTrainedModel):
     config_class = EdgeTamVisionConfig
     main_input_name = "pixel_values"
-    _can_record_outputs = {"hidden_states": TimmWrapperModel, "attentions": TimmWrapperModel}
+    # TODO: TimmWrapper models aren't compatible with _can_record_outputs yet. We specifically set this to
+    # an empty dict to avoid the _can_record_outputs from Sam2VisionModel being inherited here.
+    _can_record_outputs = {}

     def __init__(self, config: EdgeTamVisionConfig):
         super().__init__(config)
@@ -448,7 +455,7 @@ class EdgeTamVisionModel(EdgeTamPreTrainedModel):
             raise ValueError("You have to specify pixel_values")

         # Forward through backbone
-        backbone_output = self.backbone(pixel_values)
+        backbone_output = self.backbone(pixel_values, **kwargs)
         intermediate_hidden_states = backbone_output.last_hidden_state
         intermediate_hidden_states = [hidden_state.permute(0, 2, 3, 1) for hidden_state in intermediate_hidden_states]

@@ -461,6 +468,7 @@ class EdgeTamVisionModel(EdgeTamPreTrainedModel):
             last_hidden_state=intermediate_hidden_states[-1],
             fpn_hidden_states=fpn_hidden_states,
             fpn_position_encoding=fpn_position_encoding,
+            hidden_states=backbone_output.hidden_states,
         )

@@ -914,6 +922,7 @@ class EdgeTamMaskDecoder(nn.Module):
 class EdgeTamModel(EdgeTamPreTrainedModel):
     input_modalities = ("image", "text")
     _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(EdgeTamTwoWayAttentionBlock, index=2)}
+    _tied_weights_keys = {}
     _keys_to_ignore_on_load_unexpected = [
         r"^memory_.*",
         r"^mask_downsample.*",
@@ -969,7 +978,8 @@ class EdgeTamModel(EdgeTamPreTrainedModel):
             Input pixel values
         """
         batch_size = pixel_values.shape[0]
-        feature_maps, _, _, _ = self.get_image_features(pixel_values, **kwargs)
+        image_outputs = self.get_image_features(pixel_values, return_dict=True, **kwargs)
+        feature_maps = image_outputs.fpn_hidden_states

         # add no memory embedding to the last feature map
         feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding
@@ -1088,14 +1098,16 @@ class EdgeTamModel(EdgeTamPreTrainedModel):

         ```python
         >>> from PIL import Image
-        >>> import requests
+        >>> import httpx
+        >>> from io import BytesIO
         >>> from transformers import AutoModel, AutoProcessor

         >>> model = AutoModel.from_pretrained("danelcsb/edgetam.1_hiera_tiny")
         >>> processor = AutoProcessor.from_pretrained("danelcsb/edgetam.1_hiera_tiny")

-        >>> img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
-        >>> raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
+        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
+        >>> with httpx.stream("GET", url) as response:
+        ...     raw_image = Image.open(BytesIO(response.read())).convert("RGB")
         >>> input_points = [[[400, 650]]] # 2D location of a window on the car
         >>> inputs = processor(images=raw_image, input_points=input_points, return_tensors="pt")

@@ -1125,10 +1137,12 @@ class EdgeTamModel(EdgeTamPreTrainedModel):
         vision_hidden_states = None

         if pixel_values is not None:
-            feature_maps, _, vision_hidden_states, vision_attentions = self.get_image_features(
-                pixel_values,
-                **kwargs,
+            image_outputs: EdgeTamVisionEncoderOutput = self.get_image_features(
+                pixel_values, return_dict=True, **kwargs
             )
+            feature_maps = image_outputs.fpn_hidden_states
+            vision_hidden_states = image_outputs.hidden_states
+            vision_attentions = image_outputs.attentions

         # add no memory embedding to the last feature map
         feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding
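Docstrings across this release swap `requests` for `httpx`. For a one-shot download, the streaming context in the example above can also be replaced by a plain GET; a minimal equivalent sketch:

```python
# Minimal non-streaming httpx equivalent of the docstring example above.
from io import BytesIO

import httpx
from PIL import Image

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
response = httpx.get(url, follow_redirects=True)
response.raise_for_status()  # fail loudly on a bad download
raw_image = Image.open(BytesIO(response.content)).convert("RGB")
print(raw_image.size)
```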
@@ -1188,34 +1202,18 @@ class EdgeTamModel(EdgeTamPreTrainedModel):
             vision_attentions=vision_attentions,
         )

+    @can_return_tuple
+    @auto_docstring
     def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[
-        list[torch.Tensor],
-        list[torch.Tensor],
-        tuple[torch.FloatTensor, ...] | None,
-        tuple[torch.FloatTensor, ...] | None,
-    ]:
+    ) -> tuple | EdgeTamVisionEncoderOutput:
         r"""
-        Extract and preprocess image features using the vision encoder.
-
-        Args:
-            pixel_values (`torch.FloatTensor`):
-                Input pixel values of shape `(batch_size, num_channels, height, width)`.
-
-        Returns:
-            `tuple`: A tuple containing:
-            - feature_maps (`list[torch.Tensor]`): List of feature maps from different levels.
-            - feature_maps_position_embeddings (`list[torch.Tensor]`): List of positional embeddings for each feature level.
-            - vision_hidden_states (`tuple[torch.FloatTensor]`, *optional*): Hidden states from the vision encoder.
-            - vision_attentions (`tuple[torch.FloatTensor]`, *optional*): Attention weights from the vision encoder.
+        pixel_values (`torch.FloatTensor`):
+            Input pixel values of shape `(batch_size, num_channels, height, width)`.
         """
-        vision_outputs: EdgeTamVisionEncoderOutput = self.vision_encoder(
-            pixel_values,
-            **kwargs,
-        )
+        vision_outputs: EdgeTamVisionEncoderOutput = self.vision_encoder(pixel_values, return_dict=True, **kwargs)

         feature_maps = vision_outputs.fpn_hidden_states
         feature_maps_position_embeddings = vision_outputs.fpn_position_encoding
@@ -1232,8 +1230,10 @@ class EdgeTamModel(EdgeTamPreTrainedModel):
             feature_map_position_embedding.flatten(2).permute(2, 0, 1)
             for feature_map_position_embedding in feature_maps_position_embeddings
         ]
+        vision_outputs.fpn_hidden_states = feature_maps
+        vision_outputs.fpn_position_encoding = feature_maps_position_embeddings

-        return feature_maps, feature_maps_position_embeddings, vision_outputs.hidden_states, vision_outputs.attentions
+        return vision_outputs


 __all__ = ["EdgeTamModel", "EdgeTamVisionModel", "EdgeTamPreTrainedModel"]
@@ -37,11 +37,6 @@ from ..sam2.modeling_sam2 import (
 )


-# fix this in modular
-if True:
-    from ..timm_wrapper.modeling_timm_wrapper import TimmWrapperModel
-
-
 class EdgeTamVisionConfig(PreTrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`EdgeTamVisionModel`]. It is used to instantiate a SAM
@@ -188,7 +183,9 @@ class EdgeTamPreTrainedModel(Sam2PreTrainedModel):
 class EdgeTamVisionModel(Sam2VisionModel):
     config_class = EdgeTamVisionConfig
     main_input_name = "pixel_values"
-    _can_record_outputs = {"hidden_states": TimmWrapperModel, "attentions": TimmWrapperModel}
+    # TODO: TimmWrapper models aren't compatible with _can_record_outputs yet. We specifically set this to
+    # an empty dict to avoid the _can_record_outputs from Sam2VisionModel being inherited here.
+    _can_record_outputs = {}

     def get_input_embeddings(self):
         raise NotImplementedError("Can't get input embeddings from timm wrapper model")
@@ -203,7 +200,7 @@ class EdgeTamVisionModel(Sam2VisionModel):
             raise ValueError("You have to specify pixel_values")

         # Forward through backbone
-        backbone_output = self.backbone(pixel_values)
+        backbone_output = self.backbone(pixel_values, **kwargs)
         intermediate_hidden_states = backbone_output.last_hidden_state
         intermediate_hidden_states = [hidden_state.permute(0, 2, 3, 1) for hidden_state in intermediate_hidden_states]

@@ -216,6 +213,7 @@ class EdgeTamVisionModel(Sam2VisionModel):
             last_hidden_state=intermediate_hidden_states[-1],
             fpn_hidden_states=fpn_hidden_states,
             fpn_position_encoding=fpn_position_encoding,
+            hidden_states=backbone_output.hidden_states,
         )

@@ -37,12 +37,12 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import BaseModelOutput
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...pytorch_utils import compile_compatible_method_lru_cache
-from ...utils import ModelOutput, auto_docstring
-from ...utils.generic import TransformersKwargs
+from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging
+from ...utils.generic import TransformersKwargs, is_flash_attention_requested
 from ..auto import AutoModel
 from .configuration_edgetam_video import (
     EdgeTamVideoConfig,
@@ -51,6 +51,9 @@ from .configuration_edgetam_video import (
 )


+logger = logging.get_logger(__name__)
+
+
 class EdgeTamVideoLayerNorm(nn.LayerNorm):
     r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
     The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
@@ -116,16 +119,10 @@ class EdgeTamVideoMemoryFuserCXBlock(GradientCheckpointingLayer):

 @dataclass
 @auto_docstring(custom_intro="Base class for the vision encoder's outputs.")
-class EdgeTamVideoVisionEncoderOutput(ModelOutput):
+class EdgeTamVideoVisionEncoderOutput(BaseModelOutputWithPooling):
     r"""
     last_hidden_state (`torch.FloatTensor` of shape `(batch_size, height, width, hidden_size)`):
         Sequence of hidden-states at the output of the last layer of the model.
-    fpn_hidden_states (`tuple(torch.FloatTensor)`):
-        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
-        `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck.
-    fpn_position_encoding (`tuple(torch.FloatTensor)`):
-        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
-        `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`.
     hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
         Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
         one for the output of each stage) of shape `(batch_size, height, width, hidden_size)`. Hidden-states of the
@@ -134,13 +131,16 @@ class EdgeTamVideoVisionEncoderOutput(ModelOutput):
         Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
         sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
         the self-attention heads.
+    fpn_hidden_states (`tuple(torch.FloatTensor)`):
+        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
+        `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck.
+    fpn_position_encoding (`tuple(torch.FloatTensor)`):
+        Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
+        `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`.
     """

-    last_hidden_state: torch.FloatTensor | None = None
     fpn_hidden_states: torch.FloatTensor | None = None
     fpn_position_encoding: torch.FloatTensor | None = None
-    hidden_states: tuple[torch.FloatTensor, ...] | None = None
-    attentions: tuple[torch.FloatTensor, ...] | None = None


 class EdgeTamVideoVisionRotaryEmbedding(nn.Module):
@@ -245,9 +245,18 @@ class EdgeTamVideoAttention(nn.Module):
         key = self.k_proj(key).view(*new_shape).transpose(1, 2)
         value = self.v_proj(value).view(*new_shape).transpose(1, 2)

-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+
+        if is_flash_attention_requested(self.config) and attention_similarity is not None:
+            # Target guided masks are represented as float masks and are incompatible with Flash Attention
+            # Fallback to SDPA for this call only so the rest of the model can still benefit from FA
+            attention_interface = ALL_ATTENTION_FUNCTIONS["sdpa"]
+            logger.warning_once(
+                "Falling back to SDPA for target-guided attention because "
+                "Flash Attention does not support additive bias masks."
+            )

         attn_output, attn_weights = attention_interface(
             self,
@@ -355,9 +364,9 @@ class EdgeTamVideoRoPESelfAttention(nn.Module):
         # Apply rotary position encoding for self-attention
         query, key = apply_rotary_pos_emb_2d_self_attn(query, key, cos=cos, sin=sin)

-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
@@ -506,9 +515,9 @@ class EdgeTamVideoRoPECrossAttention(nn.Module):
             num_k_exclude_rope=num_k_exclude_rope,
         )

-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
@@ -528,7 +537,7 @@ class EdgeTamVideoRoPECrossAttention(nn.Module):
         return attn_output, attn_weights


-class EdgeTamVideoTwoWayAttentionBlock(nn.Module):
+class EdgeTamVideoTwoWayAttentionBlock(GradientCheckpointingLayer):
     def __init__(self, config: EdgeTamVideoMaskDecoderConfig, skip_first_layer_pe: bool = False):
         """
         A transformer block with four layers:
@@ -807,7 +816,7 @@ class EdgeTamVideoPreTrainedModel(PreTrainedModel):
     main_input_name = "pixel_values"
     input_modalities = "video"
     _supports_sdpa = True
-    _supports_flash_attn_2 = True
+    _supports_flash_attn = True
     _supports_attention_backend = True

     @torch.no_grad()
@@ -1322,9 +1331,9 @@ class EdgeTamVideoPerceiverAttention(nn.Module):
         value = value + pos_encoding

         # Apply attention
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, _ = attention_interface(
             self,
@@ -1991,6 +2000,7 @@ def get_1d_sine_pe(pos_inds, dim, temperature=10000):
 class EdgeTamVideoModel(EdgeTamVideoPreTrainedModel):
     input_modalities = ("video", "text")
     _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(EdgeTamVideoTwoWayAttentionBlock, index=2)}
+    _tied_weights_keys = {}
     _keys_to_ignore_on_load_unexpected = []

     def __init__(self, config: EdgeTamVideoConfig):
@@ -2074,7 +2084,8 @@ class EdgeTamVideoModel(EdgeTamVideoPreTrainedModel):
             Input pixel values
         """
         batch_size = pixel_values.shape[0]
-        feature_maps, _, _, _ = self.get_image_features(pixel_values, **kwargs)
+        image_outputs = self.get_image_features(pixel_values, return_dict=True, **kwargs)
+        feature_maps = image_outputs.fpn_hidden_states

         # add no memory embedding to the last feature map
         feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding
@@ -2219,34 +2230,18 @@ class EdgeTamVideoModel(EdgeTamVideoPreTrainedModel):
             frame_idx=frame_idx,
         )

+    @can_return_tuple
+    @auto_docstring
     def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[
-        list[torch.Tensor],
-        list[torch.Tensor],
-        tuple[torch.FloatTensor, ...] | None,
-        tuple[torch.FloatTensor, ...] | None,
-    ]:
+    ) -> tuple | EdgeTamVideoVisionEncoderOutput:
         r"""
-        Extract and preprocess image features using the vision encoder.
-
-        Args:
-            pixel_values (`torch.FloatTensor`):
-                Input pixel values of shape `(batch_size, num_channels, height, width)`.
-
-        Returns:
-            `tuple`: A tuple containing:
-            - feature_maps (`list[torch.Tensor]`): List of feature maps from different levels.
-            - feature_maps_position_embeddings (`list[torch.Tensor]`): List of positional embeddings for each feature level.
-            - vision_hidden_states (`tuple[torch.FloatTensor]`, *optional*): Hidden states from the vision encoder.
-            - vision_attentions (`tuple[torch.FloatTensor]`, *optional*): Attention weights from the vision encoder.
+        pixel_values (`torch.FloatTensor`):
+            Input pixel values of shape `(batch_size, num_channels, height, width)`.
         """
-        vision_outputs: EdgeTamVideoVisionEncoderOutput = self.vision_encoder(
-            pixel_values,
-            **kwargs,
-        )
+        vision_outputs: EdgeTamVideoVisionEncoderOutput = self.vision_encoder(pixel_values, return_dict=True, **kwargs)

         feature_maps = vision_outputs.fpn_hidden_states
         feature_maps_position_embeddings = vision_outputs.fpn_position_encoding
@@ -2263,8 +2258,10 @@ class EdgeTamVideoModel(EdgeTamVideoPreTrainedModel):
             feature_map_position_embedding.flatten(2).permute(2, 0, 1)
             for feature_map_position_embedding in feature_maps_position_embeddings
         ]
+        vision_outputs.fpn_hidden_states = feature_maps
+        vision_outputs.fpn_position_encoding = feature_maps_position_embeddings

-        return feature_maps, feature_maps_position_embeddings, vision_outputs.hidden_states, vision_outputs.attentions
+        return vision_outputs

     def _prepare_vision_features(
         self,
@@ -2281,7 +2278,9 @@ class EdgeTamVideoModel(EdgeTamVideoPreTrainedModel):
         else:
             # Compute features using image encoder
             image_batch = inference_session.get_frame(frame_idx).unsqueeze(0) # Add batch dimension
-            vision_feats, vision_pos_embeds, _, _ = self.get_image_features(image_batch)
+            image_outputs = self.get_image_features(image_batch, return_dict=True)
+            vision_feats = image_outputs.fpn_hidden_states
+            vision_pos_embeds = image_outputs.fpn_position_encoding
             # Cache features
             inference_session.cache.cache_vision_features(
                 frame_idx, {"vision_feats": vision_feats, "vision_pos_embeds": vision_pos_embeds}
@@ -2386,10 +2385,10 @@ class EdgeTamVideoModel(EdgeTamVideoPreTrainedModel):
         vision_hidden_states = None

         if pixel_values is not None:
-            feature_maps, _, vision_hidden_states, vision_attentions = self.get_image_features(
-                pixel_values,
-                **kwargs,
-            )
+            image_outputs = self.get_image_features(pixel_values, return_dict=True, **kwargs)
+            feature_maps = image_outputs.fpn_hidden_states
+            vision_hidden_states = image_outputs.hidden_states
+            vision_attentions = image_outputs.attentions

         # add no memory embedding to the last feature map
         feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding
@@ -536,9 +536,9 @@ class EdgeTamVideoRoPESelfAttention(nn.Module):
         # Apply rotary position encoding for self-attention
         query, key = apply_rotary_pos_emb_2d_self_attn(query, key, cos=cos, sin=sin)

-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
@@ -612,9 +612,9 @@ class EdgeTamVideoRoPECrossAttention(nn.Module):
             num_k_exclude_rope=num_k_exclude_rope,
         )

-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, attn_weights = attention_interface(
             self,
@@ -854,9 +854,9 @@ class EdgeTamVideoPerceiverAttention(nn.Module):
         value = value + pos_encoding

         # Apply attention
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )

         attn_output, _ = attention_interface(
             self,
@@ -7,8 +7,8 @@
 from typing import Optional

 import torch
+import torchvision.transforms.v2.functional as tvF
 from PIL import Image, ImageDraw
-from torchvision.transforms.v2 import functional as F

 from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature
 from ...image_transforms import group_images_by_shape, reorder_images
@@ -84,7 +84,7 @@ def convert_to_grayscale(
     """
     if is_grayscale(image):
        return image
-    return F.rgb_to_grayscale(image, num_output_channels=3)
+    return tvF.rgb_to_grayscale(image, num_output_channels=3)


 @auto_docstring
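`convert_to_grayscale` keeps three output channels so downstream code that expects RGB-shaped tensors continues to work; `tvF.rgb_to_grayscale` replicates the single luminance channel. A quick check:

```python
# Quick check: rgb_to_grayscale with num_output_channels=3 keeps the RGB shape
# while collapsing the content to one replicated luminance channel.
import torch
import torchvision.transforms.v2.functional as tvF

image = torch.rand(3, 8, 8)
gray = tvF.rgb_to_grayscale(image, num_output_channels=3)
print(gray.shape)  # torch.Size([3, 8, 8])
print(torch.equal(gray[0], gray[1]), torch.equal(gray[1], gray[2]))  # True True
```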
@@ -111,6 +111,7 @@ class EfficientLoFTRImageProcessorFast(BaseImageProcessorFast):
         **kwargs,
     ) -> ImageInput:
         # we need to handle image pairs validation and flattening
+        images = self.fetch_images(images)
         return flatten_pair_images(images)

     def _preprocess(
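The pair-validation hook above now routes inputs through `self.fetch_images(images)` before flattening, which by its name and placement resolves URL or path references into images prior to pair validation. A hedged usage sketch, assuming URL inputs are accepted after this change (the checkpoint name is an assumption; the image URL reuses the documentation image from the EdgeTam docstring above):

```python
# Hedged sketch: an image pair given as URLs, assuming fetch_images resolves
# them before pair validation. The checkpoint name is an assumption.
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("zju-community/efficientloftr")
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
inputs = processor(images=[[url, url]], return_tensors="pt")  # a (trivial) image pair
print(inputs["pixel_values"].shape)
```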
@@ -120,7 +121,7 @@ class EfficientLoFTRImageProcessorFast(BaseImageProcessorFast):
         rescale_factor: float,
         do_rescale: bool,
         do_resize: bool,
-        interpolation: Optional["F.InterpolationMode"],
+        interpolation: Optional["tvF.InterpolationMode"],
         do_grayscale: bool,
         disable_grouping: bool,
         return_tensors: str | TensorType,