transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (1021)
  1. transformers/__init__.py +4 -11
  2. transformers/activations.py +2 -2
  3. transformers/backbone_utils.py +326 -0
  4. transformers/cache_utils.py +11 -2
  5. transformers/cli/serve.py +11 -8
  6. transformers/configuration_utils.py +1 -69
  7. transformers/conversion_mapping.py +146 -26
  8. transformers/convert_slow_tokenizer.py +6 -4
  9. transformers/core_model_loading.py +207 -118
  10. transformers/dependency_versions_check.py +0 -1
  11. transformers/dependency_versions_table.py +7 -8
  12. transformers/file_utils.py +0 -2
  13. transformers/generation/candidate_generator.py +1 -2
  14. transformers/generation/continuous_batching/cache.py +40 -38
  15. transformers/generation/continuous_batching/cache_manager.py +3 -16
  16. transformers/generation/continuous_batching/continuous_api.py +94 -406
  17. transformers/generation/continuous_batching/input_ouputs.py +464 -0
  18. transformers/generation/continuous_batching/requests.py +54 -17
  19. transformers/generation/continuous_batching/scheduler.py +77 -95
  20. transformers/generation/logits_process.py +10 -5
  21. transformers/generation/stopping_criteria.py +1 -2
  22. transformers/generation/utils.py +75 -95
  23. transformers/image_processing_utils.py +0 -3
  24. transformers/image_processing_utils_fast.py +17 -18
  25. transformers/image_transforms.py +44 -13
  26. transformers/image_utils.py +0 -5
  27. transformers/initialization.py +57 -0
  28. transformers/integrations/__init__.py +10 -24
  29. transformers/integrations/accelerate.py +47 -11
  30. transformers/integrations/deepspeed.py +145 -3
  31. transformers/integrations/executorch.py +2 -6
  32. transformers/integrations/finegrained_fp8.py +142 -7
  33. transformers/integrations/flash_attention.py +2 -7
  34. transformers/integrations/hub_kernels.py +18 -7
  35. transformers/integrations/moe.py +226 -106
  36. transformers/integrations/mxfp4.py +47 -34
  37. transformers/integrations/peft.py +488 -176
  38. transformers/integrations/tensor_parallel.py +641 -581
  39. transformers/masking_utils.py +153 -9
  40. transformers/modeling_flash_attention_utils.py +1 -2
  41. transformers/modeling_utils.py +359 -358
  42. transformers/models/__init__.py +6 -0
  43. transformers/models/afmoe/configuration_afmoe.py +14 -4
  44. transformers/models/afmoe/modeling_afmoe.py +8 -8
  45. transformers/models/afmoe/modular_afmoe.py +7 -7
  46. transformers/models/aimv2/configuration_aimv2.py +2 -7
  47. transformers/models/aimv2/modeling_aimv2.py +26 -24
  48. transformers/models/aimv2/modular_aimv2.py +8 -12
  49. transformers/models/albert/configuration_albert.py +8 -1
  50. transformers/models/albert/modeling_albert.py +3 -3
  51. transformers/models/align/configuration_align.py +8 -5
  52. transformers/models/align/modeling_align.py +22 -24
  53. transformers/models/altclip/configuration_altclip.py +4 -6
  54. transformers/models/altclip/modeling_altclip.py +30 -26
  55. transformers/models/apertus/configuration_apertus.py +5 -7
  56. transformers/models/apertus/modeling_apertus.py +4 -4
  57. transformers/models/apertus/modular_apertus.py +8 -10
  58. transformers/models/arcee/configuration_arcee.py +5 -7
  59. transformers/models/arcee/modeling_arcee.py +4 -4
  60. transformers/models/aria/configuration_aria.py +11 -21
  61. transformers/models/aria/modeling_aria.py +39 -36
  62. transformers/models/aria/modular_aria.py +33 -39
  63. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
  64. transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
  65. transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
  66. transformers/models/auto/auto_factory.py +8 -6
  67. transformers/models/auto/configuration_auto.py +22 -0
  68. transformers/models/auto/image_processing_auto.py +17 -13
  69. transformers/models/auto/modeling_auto.py +15 -0
  70. transformers/models/auto/processing_auto.py +9 -18
  71. transformers/models/auto/tokenization_auto.py +17 -15
  72. transformers/models/autoformer/modeling_autoformer.py +2 -1
  73. transformers/models/aya_vision/configuration_aya_vision.py +4 -0
  74. transformers/models/aya_vision/modeling_aya_vision.py +29 -62
  75. transformers/models/aya_vision/modular_aya_vision.py +20 -45
  76. transformers/models/bamba/configuration_bamba.py +17 -7
  77. transformers/models/bamba/modeling_bamba.py +23 -55
  78. transformers/models/bamba/modular_bamba.py +19 -54
  79. transformers/models/bark/configuration_bark.py +2 -1
  80. transformers/models/bark/modeling_bark.py +24 -10
  81. transformers/models/bart/configuration_bart.py +9 -4
  82. transformers/models/bart/modeling_bart.py +9 -12
  83. transformers/models/beit/configuration_beit.py +2 -4
  84. transformers/models/beit/image_processing_beit_fast.py +3 -3
  85. transformers/models/beit/modeling_beit.py +14 -9
  86. transformers/models/bert/configuration_bert.py +12 -1
  87. transformers/models/bert/modeling_bert.py +6 -30
  88. transformers/models/bert_generation/configuration_bert_generation.py +17 -1
  89. transformers/models/bert_generation/modeling_bert_generation.py +6 -6
  90. transformers/models/big_bird/configuration_big_bird.py +12 -8
  91. transformers/models/big_bird/modeling_big_bird.py +0 -15
  92. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
  93. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
  94. transformers/models/biogpt/configuration_biogpt.py +8 -1
  95. transformers/models/biogpt/modeling_biogpt.py +4 -8
  96. transformers/models/biogpt/modular_biogpt.py +1 -5
  97. transformers/models/bit/configuration_bit.py +2 -4
  98. transformers/models/bit/modeling_bit.py +6 -5
  99. transformers/models/bitnet/configuration_bitnet.py +5 -7
  100. transformers/models/bitnet/modeling_bitnet.py +3 -4
  101. transformers/models/bitnet/modular_bitnet.py +3 -4
  102. transformers/models/blenderbot/configuration_blenderbot.py +8 -4
  103. transformers/models/blenderbot/modeling_blenderbot.py +4 -4
  104. transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
  105. transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
  106. transformers/models/blip/configuration_blip.py +9 -9
  107. transformers/models/blip/modeling_blip.py +55 -37
  108. transformers/models/blip_2/configuration_blip_2.py +2 -1
  109. transformers/models/blip_2/modeling_blip_2.py +81 -56
  110. transformers/models/bloom/configuration_bloom.py +5 -1
  111. transformers/models/bloom/modeling_bloom.py +2 -1
  112. transformers/models/blt/configuration_blt.py +23 -12
  113. transformers/models/blt/modeling_blt.py +20 -14
  114. transformers/models/blt/modular_blt.py +70 -10
  115. transformers/models/bridgetower/configuration_bridgetower.py +7 -1
  116. transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
  117. transformers/models/bridgetower/modeling_bridgetower.py +29 -15
  118. transformers/models/bros/configuration_bros.py +24 -17
  119. transformers/models/camembert/configuration_camembert.py +8 -1
  120. transformers/models/camembert/modeling_camembert.py +6 -6
  121. transformers/models/canine/configuration_canine.py +4 -1
  122. transformers/models/chameleon/configuration_chameleon.py +5 -7
  123. transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
  124. transformers/models/chameleon/modeling_chameleon.py +82 -36
  125. transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
  126. transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
  127. transformers/models/clap/configuration_clap.py +4 -8
  128. transformers/models/clap/modeling_clap.py +21 -22
  129. transformers/models/clip/configuration_clip.py +4 -1
  130. transformers/models/clip/image_processing_clip_fast.py +9 -0
  131. transformers/models/clip/modeling_clip.py +25 -22
  132. transformers/models/clipseg/configuration_clipseg.py +4 -1
  133. transformers/models/clipseg/modeling_clipseg.py +27 -25
  134. transformers/models/clipseg/processing_clipseg.py +11 -3
  135. transformers/models/clvp/configuration_clvp.py +14 -2
  136. transformers/models/clvp/modeling_clvp.py +19 -30
  137. transformers/models/codegen/configuration_codegen.py +4 -3
  138. transformers/models/codegen/modeling_codegen.py +2 -1
  139. transformers/models/cohere/configuration_cohere.py +5 -7
  140. transformers/models/cohere/modeling_cohere.py +4 -4
  141. transformers/models/cohere/modular_cohere.py +3 -3
  142. transformers/models/cohere2/configuration_cohere2.py +6 -8
  143. transformers/models/cohere2/modeling_cohere2.py +4 -4
  144. transformers/models/cohere2/modular_cohere2.py +9 -11
  145. transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
  146. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
  147. transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
  148. transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
  149. transformers/models/colqwen2/modeling_colqwen2.py +7 -6
  150. transformers/models/colqwen2/modular_colqwen2.py +7 -6
  151. transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
  152. transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
  153. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
  154. transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
  155. transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
  156. transformers/models/convbert/configuration_convbert.py +11 -7
  157. transformers/models/convnext/configuration_convnext.py +2 -4
  158. transformers/models/convnext/image_processing_convnext_fast.py +2 -2
  159. transformers/models/convnext/modeling_convnext.py +7 -6
  160. transformers/models/convnextv2/configuration_convnextv2.py +2 -4
  161. transformers/models/convnextv2/modeling_convnextv2.py +7 -6
  162. transformers/models/cpmant/configuration_cpmant.py +4 -0
  163. transformers/models/csm/configuration_csm.py +9 -15
  164. transformers/models/csm/modeling_csm.py +3 -3
  165. transformers/models/ctrl/configuration_ctrl.py +16 -0
  166. transformers/models/ctrl/modeling_ctrl.py +13 -25
  167. transformers/models/cwm/configuration_cwm.py +5 -7
  168. transformers/models/cwm/modeling_cwm.py +4 -4
  169. transformers/models/d_fine/configuration_d_fine.py +10 -56
  170. transformers/models/d_fine/modeling_d_fine.py +728 -868
  171. transformers/models/d_fine/modular_d_fine.py +335 -412
  172. transformers/models/dab_detr/configuration_dab_detr.py +22 -48
  173. transformers/models/dab_detr/modeling_dab_detr.py +11 -7
  174. transformers/models/dac/modeling_dac.py +1 -1
  175. transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
  176. transformers/models/data2vec/configuration_data2vec_text.py +11 -2
  177. transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
  178. transformers/models/data2vec/modeling_data2vec_text.py +6 -6
  179. transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
  180. transformers/models/dbrx/configuration_dbrx.py +11 -3
  181. transformers/models/dbrx/modeling_dbrx.py +6 -6
  182. transformers/models/dbrx/modular_dbrx.py +6 -6
  183. transformers/models/deberta/configuration_deberta.py +6 -0
  184. transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
  185. transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
  186. transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
  187. transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
  188. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
  189. transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
  190. transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
  191. transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
  192. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
  193. transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
  194. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
  195. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
  196. transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
  197. transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
  198. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
  199. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
  200. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
  201. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
  202. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
  203. transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
  204. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
  205. transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
  206. transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
  207. transformers/models/deit/modeling_deit.py +11 -7
  208. transformers/models/depth_anything/configuration_depth_anything.py +12 -42
  209. transformers/models/depth_anything/modeling_depth_anything.py +5 -3
  210. transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
  211. transformers/models/depth_pro/modeling_depth_pro.py +8 -4
  212. transformers/models/detr/configuration_detr.py +18 -49
  213. transformers/models/detr/image_processing_detr_fast.py +11 -11
  214. transformers/models/detr/modeling_detr.py +695 -734
  215. transformers/models/dia/configuration_dia.py +4 -7
  216. transformers/models/dia/generation_dia.py +8 -17
  217. transformers/models/dia/modeling_dia.py +7 -7
  218. transformers/models/dia/modular_dia.py +4 -4
  219. transformers/models/diffllama/configuration_diffllama.py +5 -7
  220. transformers/models/diffllama/modeling_diffllama.py +3 -8
  221. transformers/models/diffllama/modular_diffllama.py +2 -7
  222. transformers/models/dinat/configuration_dinat.py +2 -4
  223. transformers/models/dinat/modeling_dinat.py +7 -6
  224. transformers/models/dinov2/configuration_dinov2.py +2 -4
  225. transformers/models/dinov2/modeling_dinov2.py +9 -8
  226. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
  227. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
  228. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
  229. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
  230. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
  231. transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
  232. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
  233. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
  234. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
  235. transformers/models/distilbert/configuration_distilbert.py +8 -1
  236. transformers/models/distilbert/modeling_distilbert.py +3 -3
  237. transformers/models/doge/configuration_doge.py +17 -7
  238. transformers/models/doge/modeling_doge.py +4 -4
  239. transformers/models/doge/modular_doge.py +20 -10
  240. transformers/models/donut/image_processing_donut_fast.py +4 -4
  241. transformers/models/dots1/configuration_dots1.py +16 -7
  242. transformers/models/dots1/modeling_dots1.py +4 -4
  243. transformers/models/dpr/configuration_dpr.py +19 -1
  244. transformers/models/dpt/configuration_dpt.py +23 -65
  245. transformers/models/dpt/image_processing_dpt_fast.py +5 -5
  246. transformers/models/dpt/modeling_dpt.py +19 -15
  247. transformers/models/dpt/modular_dpt.py +4 -4
  248. transformers/models/edgetam/configuration_edgetam.py +1 -1
  249. transformers/models/edgetam/modeling_edgetam.py +53 -53
  250. transformers/models/edgetam/modular_edgetam.py +5 -7
  251. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
  252. transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
  253. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
  254. transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
  255. transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
  256. transformers/models/electra/configuration_electra.py +13 -2
  257. transformers/models/electra/modeling_electra.py +6 -6
  258. transformers/models/emu3/configuration_emu3.py +12 -10
  259. transformers/models/emu3/modeling_emu3.py +84 -47
  260. transformers/models/emu3/modular_emu3.py +77 -39
  261. transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
  262. transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
  263. transformers/models/eomt/configuration_eomt.py +12 -13
  264. transformers/models/eomt/image_processing_eomt_fast.py +3 -3
  265. transformers/models/eomt/modeling_eomt.py +3 -3
  266. transformers/models/eomt/modular_eomt.py +17 -17
  267. transformers/models/eomt_dinov3/__init__.py +28 -0
  268. transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
  269. transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
  270. transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
  271. transformers/models/ernie/configuration_ernie.py +24 -2
  272. transformers/models/ernie/modeling_ernie.py +6 -30
  273. transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
  274. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  275. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
  276. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
  277. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
  278. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
  279. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
  280. transformers/models/esm/configuration_esm.py +9 -11
  281. transformers/models/esm/modeling_esm.py +3 -3
  282. transformers/models/esm/modeling_esmfold.py +1 -6
  283. transformers/models/esm/openfold_utils/protein.py +2 -3
  284. transformers/models/evolla/configuration_evolla.py +21 -8
  285. transformers/models/evolla/modeling_evolla.py +11 -7
  286. transformers/models/evolla/modular_evolla.py +5 -1
  287. transformers/models/exaone4/configuration_exaone4.py +8 -5
  288. transformers/models/exaone4/modeling_exaone4.py +4 -4
  289. transformers/models/exaone4/modular_exaone4.py +11 -8
  290. transformers/models/exaone_moe/__init__.py +27 -0
  291. transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
  292. transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
  293. transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
  294. transformers/models/falcon/configuration_falcon.py +9 -1
  295. transformers/models/falcon/modeling_falcon.py +3 -8
  296. transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
  297. transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
  298. transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
  299. transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
  300. transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
  301. transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
  302. transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
  303. transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
  304. transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
  305. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
  306. transformers/models/flaubert/configuration_flaubert.py +10 -4
  307. transformers/models/flaubert/modeling_flaubert.py +1 -1
  308. transformers/models/flava/configuration_flava.py +4 -3
  309. transformers/models/flava/image_processing_flava_fast.py +4 -4
  310. transformers/models/flava/modeling_flava.py +36 -28
  311. transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
  312. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
  313. transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
  314. transformers/models/florence2/configuration_florence2.py +4 -0
  315. transformers/models/florence2/modeling_florence2.py +57 -32
  316. transformers/models/florence2/modular_florence2.py +48 -26
  317. transformers/models/fnet/configuration_fnet.py +6 -1
  318. transformers/models/focalnet/configuration_focalnet.py +2 -4
  319. transformers/models/focalnet/modeling_focalnet.py +10 -7
  320. transformers/models/fsmt/configuration_fsmt.py +12 -16
  321. transformers/models/funnel/configuration_funnel.py +8 -0
  322. transformers/models/fuyu/configuration_fuyu.py +5 -8
  323. transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
  324. transformers/models/fuyu/modeling_fuyu.py +24 -23
  325. transformers/models/gemma/configuration_gemma.py +5 -7
  326. transformers/models/gemma/modeling_gemma.py +4 -4
  327. transformers/models/gemma/modular_gemma.py +5 -7
  328. transformers/models/gemma2/configuration_gemma2.py +5 -7
  329. transformers/models/gemma2/modeling_gemma2.py +4 -4
  330. transformers/models/gemma2/modular_gemma2.py +8 -10
  331. transformers/models/gemma3/configuration_gemma3.py +28 -22
  332. transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
  333. transformers/models/gemma3/modeling_gemma3.py +37 -33
  334. transformers/models/gemma3/modular_gemma3.py +46 -42
  335. transformers/models/gemma3n/configuration_gemma3n.py +35 -22
  336. transformers/models/gemma3n/modeling_gemma3n.py +86 -58
  337. transformers/models/gemma3n/modular_gemma3n.py +112 -75
  338. transformers/models/git/configuration_git.py +5 -7
  339. transformers/models/git/modeling_git.py +31 -41
  340. transformers/models/glm/configuration_glm.py +7 -9
  341. transformers/models/glm/modeling_glm.py +4 -4
  342. transformers/models/glm4/configuration_glm4.py +7 -9
  343. transformers/models/glm4/modeling_glm4.py +4 -4
  344. transformers/models/glm46v/configuration_glm46v.py +4 -0
  345. transformers/models/glm46v/image_processing_glm46v.py +5 -2
  346. transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
  347. transformers/models/glm46v/modeling_glm46v.py +91 -46
  348. transformers/models/glm46v/modular_glm46v.py +4 -0
  349. transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
  350. transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
  351. transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
  352. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
  353. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
  354. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
  355. transformers/models/glm4v/configuration_glm4v.py +12 -8
  356. transformers/models/glm4v/image_processing_glm4v.py +5 -2
  357. transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
  358. transformers/models/glm4v/modeling_glm4v.py +120 -63
  359. transformers/models/glm4v/modular_glm4v.py +82 -50
  360. transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
  361. transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
  362. transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
  363. transformers/models/glm_image/configuration_glm_image.py +26 -20
  364. transformers/models/glm_image/image_processing_glm_image.py +1 -1
  365. transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
  366. transformers/models/glm_image/modeling_glm_image.py +337 -236
  367. transformers/models/glm_image/modular_glm_image.py +415 -255
  368. transformers/models/glm_image/processing_glm_image.py +65 -17
  369. transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
  370. transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
  371. transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
  372. transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
  373. transformers/models/glmasr/modeling_glmasr.py +34 -28
  374. transformers/models/glmasr/modular_glmasr.py +23 -11
  375. transformers/models/glpn/image_processing_glpn_fast.py +3 -3
  376. transformers/models/glpn/modeling_glpn.py +4 -2
  377. transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
  378. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
  379. transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
  380. transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
  381. transformers/models/gpt2/configuration_gpt2.py +13 -1
  382. transformers/models/gpt2/modeling_gpt2.py +5 -5
  383. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
  384. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
  385. transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
  386. transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
  387. transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
  388. transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
  389. transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
  390. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
  391. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
  392. transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
  393. transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
  394. transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
  395. transformers/models/gptj/configuration_gptj.py +4 -4
  396. transformers/models/gptj/modeling_gptj.py +3 -7
  397. transformers/models/granite/configuration_granite.py +5 -7
  398. transformers/models/granite/modeling_granite.py +4 -4
  399. transformers/models/granite_speech/modeling_granite_speech.py +63 -37
  400. transformers/models/granitemoe/configuration_granitemoe.py +5 -7
  401. transformers/models/granitemoe/modeling_granitemoe.py +4 -4
  402. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
  403. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
  404. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
  405. transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
  406. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
  407. transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
  408. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
  409. transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
  410. transformers/models/groupvit/configuration_groupvit.py +4 -1
  411. transformers/models/groupvit/modeling_groupvit.py +29 -22
  412. transformers/models/helium/configuration_helium.py +5 -7
  413. transformers/models/helium/modeling_helium.py +4 -4
  414. transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
  415. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
  416. transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
  417. transformers/models/hiera/configuration_hiera.py +2 -4
  418. transformers/models/hiera/modeling_hiera.py +11 -8
  419. transformers/models/hubert/configuration_hubert.py +4 -1
  420. transformers/models/hubert/modeling_hubert.py +7 -4
  421. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
  422. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
  423. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
  424. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
  425. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
  426. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
  427. transformers/models/ibert/configuration_ibert.py +4 -1
  428. transformers/models/idefics/configuration_idefics.py +5 -7
  429. transformers/models/idefics/modeling_idefics.py +3 -4
  430. transformers/models/idefics/vision.py +5 -4
  431. transformers/models/idefics2/configuration_idefics2.py +1 -2
  432. transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
  433. transformers/models/idefics2/modeling_idefics2.py +72 -50
  434. transformers/models/idefics3/configuration_idefics3.py +1 -3
  435. transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
  436. transformers/models/idefics3/modeling_idefics3.py +63 -40
  437. transformers/models/ijepa/modeling_ijepa.py +3 -3
  438. transformers/models/imagegpt/configuration_imagegpt.py +9 -1
  439. transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
  440. transformers/models/imagegpt/modeling_imagegpt.py +8 -4
  441. transformers/models/informer/modeling_informer.py +3 -3
  442. transformers/models/instructblip/configuration_instructblip.py +2 -1
  443. transformers/models/instructblip/modeling_instructblip.py +65 -39
  444. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
  445. transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
  446. transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
  447. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
  448. transformers/models/internvl/configuration_internvl.py +5 -0
  449. transformers/models/internvl/modeling_internvl.py +35 -55
  450. transformers/models/internvl/modular_internvl.py +26 -38
  451. transformers/models/internvl/video_processing_internvl.py +2 -2
  452. transformers/models/jais2/configuration_jais2.py +5 -7
  453. transformers/models/jais2/modeling_jais2.py +4 -4
  454. transformers/models/jamba/configuration_jamba.py +5 -7
  455. transformers/models/jamba/modeling_jamba.py +4 -4
  456. transformers/models/jamba/modular_jamba.py +3 -3
  457. transformers/models/janus/image_processing_janus.py +2 -2
  458. transformers/models/janus/image_processing_janus_fast.py +8 -8
  459. transformers/models/janus/modeling_janus.py +63 -146
  460. transformers/models/janus/modular_janus.py +62 -20
  461. transformers/models/jetmoe/configuration_jetmoe.py +6 -4
  462. transformers/models/jetmoe/modeling_jetmoe.py +3 -3
  463. transformers/models/jetmoe/modular_jetmoe.py +3 -3
  464. transformers/models/kosmos2/configuration_kosmos2.py +10 -8
  465. transformers/models/kosmos2/modeling_kosmos2.py +56 -34
  466. transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
  467. transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
  468. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
  469. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
  470. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
  471. transformers/models/lasr/configuration_lasr.py +2 -4
  472. transformers/models/lasr/modeling_lasr.py +3 -3
  473. transformers/models/lasr/modular_lasr.py +3 -3
  474. transformers/models/layoutlm/configuration_layoutlm.py +14 -1
  475. transformers/models/layoutlm/modeling_layoutlm.py +3 -3
  476. transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
  477. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
  478. transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
  479. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
  480. transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
  481. transformers/models/led/configuration_led.py +7 -8
  482. transformers/models/levit/image_processing_levit_fast.py +4 -4
  483. transformers/models/lfm2/configuration_lfm2.py +5 -7
  484. transformers/models/lfm2/modeling_lfm2.py +4 -4
  485. transformers/models/lfm2/modular_lfm2.py +3 -3
  486. transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
  487. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
  488. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  489. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
  490. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
  491. transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
  492. transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
  493. transformers/models/lightglue/modeling_lightglue.py +3 -3
  494. transformers/models/lightglue/modular_lightglue.py +3 -3
  495. transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
  496. transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
  497. transformers/models/lilt/configuration_lilt.py +6 -1
  498. transformers/models/llama/configuration_llama.py +5 -7
  499. transformers/models/llama/modeling_llama.py +4 -4
  500. transformers/models/llama4/configuration_llama4.py +67 -47
  501. transformers/models/llama4/image_processing_llama4_fast.py +3 -3
  502. transformers/models/llama4/modeling_llama4.py +46 -44
  503. transformers/models/llava/configuration_llava.py +10 -0
  504. transformers/models/llava/image_processing_llava_fast.py +3 -3
  505. transformers/models/llava/modeling_llava.py +38 -65
  506. transformers/models/llava_next/configuration_llava_next.py +2 -1
  507. transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
  508. transformers/models/llava_next/modeling_llava_next.py +61 -60
  509. transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
  510. transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
  511. transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
  512. transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
  513. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
  514. transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
  515. transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
  516. transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
  517. transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
  518. transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
  519. transformers/models/longformer/configuration_longformer.py +4 -1
  520. transformers/models/longt5/configuration_longt5.py +9 -6
  521. transformers/models/longt5/modeling_longt5.py +2 -1
  522. transformers/models/luke/configuration_luke.py +8 -1
  523. transformers/models/lw_detr/configuration_lw_detr.py +19 -31
  524. transformers/models/lw_detr/modeling_lw_detr.py +43 -44
  525. transformers/models/lw_detr/modular_lw_detr.py +36 -38
  526. transformers/models/lxmert/configuration_lxmert.py +16 -0
  527. transformers/models/m2m_100/configuration_m2m_100.py +7 -8
  528. transformers/models/m2m_100/modeling_m2m_100.py +3 -3
  529. transformers/models/mamba/configuration_mamba.py +5 -2
  530. transformers/models/mamba/modeling_mamba.py +18 -26
  531. transformers/models/mamba2/configuration_mamba2.py +5 -7
  532. transformers/models/mamba2/modeling_mamba2.py +22 -33
  533. transformers/models/marian/configuration_marian.py +10 -4
  534. transformers/models/marian/modeling_marian.py +4 -4
  535. transformers/models/markuplm/configuration_markuplm.py +4 -6
  536. transformers/models/markuplm/modeling_markuplm.py +3 -3
  537. transformers/models/mask2former/configuration_mask2former.py +12 -47
  538. transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
  539. transformers/models/mask2former/modeling_mask2former.py +18 -12
  540. transformers/models/maskformer/configuration_maskformer.py +14 -45
  541. transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
  542. transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
  543. transformers/models/maskformer/modeling_maskformer.py +15 -9
  544. transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
  545. transformers/models/mbart/configuration_mbart.py +9 -4
  546. transformers/models/mbart/modeling_mbart.py +9 -6
  547. transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
  548. transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
  549. transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
  550. transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
  551. transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
  552. transformers/models/mgp_str/modeling_mgp_str.py +4 -2
  553. transformers/models/mimi/configuration_mimi.py +4 -0
  554. transformers/models/mimi/modeling_mimi.py +40 -36
  555. transformers/models/minimax/configuration_minimax.py +8 -11
  556. transformers/models/minimax/modeling_minimax.py +5 -5
  557. transformers/models/minimax/modular_minimax.py +9 -12
  558. transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
  559. transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
  560. transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
  561. transformers/models/ministral/configuration_ministral.py +5 -7
  562. transformers/models/ministral/modeling_ministral.py +4 -4
  563. transformers/models/ministral/modular_ministral.py +5 -8
  564. transformers/models/ministral3/configuration_ministral3.py +4 -4
  565. transformers/models/ministral3/modeling_ministral3.py +4 -4
  566. transformers/models/ministral3/modular_ministral3.py +3 -3
  567. transformers/models/mistral/configuration_mistral.py +5 -7
  568. transformers/models/mistral/modeling_mistral.py +4 -4
  569. transformers/models/mistral/modular_mistral.py +3 -3
  570. transformers/models/mistral3/configuration_mistral3.py +4 -0
  571. transformers/models/mistral3/modeling_mistral3.py +36 -40
  572. transformers/models/mistral3/modular_mistral3.py +31 -32
  573. transformers/models/mixtral/configuration_mixtral.py +8 -11
  574. transformers/models/mixtral/modeling_mixtral.py +4 -4
  575. transformers/models/mlcd/modeling_mlcd.py +7 -5
  576. transformers/models/mlcd/modular_mlcd.py +7 -5
  577. transformers/models/mllama/configuration_mllama.py +5 -7
  578. transformers/models/mllama/image_processing_mllama_fast.py +6 -5
  579. transformers/models/mllama/modeling_mllama.py +19 -19
  580. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
  581. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
  582. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
  583. transformers/models/mobilebert/configuration_mobilebert.py +4 -1
  584. transformers/models/mobilebert/modeling_mobilebert.py +3 -3
  585. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
  586. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
  587. transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
  588. transformers/models/mobilevit/modeling_mobilevit.py +4 -2
  589. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
  590. transformers/models/modernbert/configuration_modernbert.py +46 -21
  591. transformers/models/modernbert/modeling_modernbert.py +146 -899
  592. transformers/models/modernbert/modular_modernbert.py +185 -908
  593. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
  594. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
  595. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
  596. transformers/models/moonshine/configuration_moonshine.py +12 -7
  597. transformers/models/moonshine/modeling_moonshine.py +7 -7
  598. transformers/models/moonshine/modular_moonshine.py +19 -13
  599. transformers/models/moshi/configuration_moshi.py +28 -2
  600. transformers/models/moshi/modeling_moshi.py +4 -9
  601. transformers/models/mpnet/configuration_mpnet.py +6 -1
  602. transformers/models/mpt/configuration_mpt.py +16 -0
  603. transformers/models/mra/configuration_mra.py +8 -1
  604. transformers/models/mt5/configuration_mt5.py +9 -5
  605. transformers/models/mt5/modeling_mt5.py +5 -8
  606. transformers/models/musicgen/configuration_musicgen.py +12 -7
  607. transformers/models/musicgen/modeling_musicgen.py +6 -5
  608. transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
  609. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
  610. transformers/models/mvp/configuration_mvp.py +8 -4
  611. transformers/models/mvp/modeling_mvp.py +6 -4
  612. transformers/models/nanochat/configuration_nanochat.py +5 -7
  613. transformers/models/nanochat/modeling_nanochat.py +4 -4
  614. transformers/models/nanochat/modular_nanochat.py +4 -4
  615. transformers/models/nemotron/configuration_nemotron.py +5 -7
  616. transformers/models/nemotron/modeling_nemotron.py +4 -14
  617. transformers/models/nllb/tokenization_nllb.py +7 -5
  618. transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
  619. transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
  620. transformers/models/nougat/image_processing_nougat_fast.py +8 -8
  621. transformers/models/nystromformer/configuration_nystromformer.py +8 -1
  622. transformers/models/olmo/configuration_olmo.py +5 -7
  623. transformers/models/olmo/modeling_olmo.py +4 -4
  624. transformers/models/olmo/modular_olmo.py +3 -3
  625. transformers/models/olmo2/configuration_olmo2.py +9 -11
  626. transformers/models/olmo2/modeling_olmo2.py +4 -4
  627. transformers/models/olmo2/modular_olmo2.py +7 -7
  628. transformers/models/olmo3/configuration_olmo3.py +10 -11
  629. transformers/models/olmo3/modeling_olmo3.py +4 -4
  630. transformers/models/olmo3/modular_olmo3.py +13 -14
  631. transformers/models/olmoe/configuration_olmoe.py +5 -7
  632. transformers/models/olmoe/modeling_olmoe.py +4 -4
  633. transformers/models/olmoe/modular_olmoe.py +3 -3
  634. transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
  635. transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
  636. transformers/models/oneformer/configuration_oneformer.py +9 -46
  637. transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
  638. transformers/models/oneformer/modeling_oneformer.py +14 -9
  639. transformers/models/openai/configuration_openai.py +16 -0
  640. transformers/models/opt/configuration_opt.py +6 -6
  641. transformers/models/opt/modeling_opt.py +5 -5
  642. transformers/models/ovis2/configuration_ovis2.py +4 -0
  643. transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
  644. transformers/models/ovis2/modeling_ovis2.py +58 -99
  645. transformers/models/ovis2/modular_ovis2.py +52 -13
  646. transformers/models/owlv2/configuration_owlv2.py +4 -1
  647. transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
  648. transformers/models/owlv2/modeling_owlv2.py +40 -27
  649. transformers/models/owlv2/modular_owlv2.py +5 -5
  650. transformers/models/owlvit/configuration_owlvit.py +4 -1
  651. transformers/models/owlvit/modeling_owlvit.py +40 -27
  652. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
  653. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
  654. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
  655. transformers/models/paligemma/configuration_paligemma.py +4 -0
  656. transformers/models/paligemma/modeling_paligemma.py +30 -26
  657. transformers/models/parakeet/configuration_parakeet.py +2 -4
  658. transformers/models/parakeet/modeling_parakeet.py +3 -3
  659. transformers/models/parakeet/modular_parakeet.py +3 -3
  660. transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
  661. transformers/models/patchtst/modeling_patchtst.py +3 -3
  662. transformers/models/pe_audio/modeling_pe_audio.py +4 -4
  663. transformers/models/pe_audio/modular_pe_audio.py +1 -1
  664. transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
  665. transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
  666. transformers/models/pe_video/modeling_pe_video.py +36 -24
  667. transformers/models/pe_video/modular_pe_video.py +36 -23
  668. transformers/models/pegasus/configuration_pegasus.py +8 -5
  669. transformers/models/pegasus/modeling_pegasus.py +4 -4
  670. transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
  671. transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
  672. transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
  673. transformers/models/perceiver/modeling_perceiver.py +17 -9
  674. transformers/models/perception_lm/modeling_perception_lm.py +26 -27
  675. transformers/models/perception_lm/modular_perception_lm.py +27 -25
  676. transformers/models/persimmon/configuration_persimmon.py +5 -7
  677. transformers/models/persimmon/modeling_persimmon.py +5 -5
  678. transformers/models/phi/configuration_phi.py +8 -6
  679. transformers/models/phi/modeling_phi.py +4 -4
  680. transformers/models/phi/modular_phi.py +3 -3
  681. transformers/models/phi3/configuration_phi3.py +9 -11
  682. transformers/models/phi3/modeling_phi3.py +4 -4
  683. transformers/models/phi3/modular_phi3.py +3 -3
  684. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
  685. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
  686. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
  687. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
  688. transformers/models/phimoe/configuration_phimoe.py +5 -7
  689. transformers/models/phimoe/modeling_phimoe.py +15 -39
  690. transformers/models/phimoe/modular_phimoe.py +12 -7
  691. transformers/models/pix2struct/configuration_pix2struct.py +12 -9
  692. transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
  693. transformers/models/pix2struct/modeling_pix2struct.py +14 -7
  694. transformers/models/pixio/configuration_pixio.py +2 -4
  695. transformers/models/pixio/modeling_pixio.py +9 -8
  696. transformers/models/pixio/modular_pixio.py +4 -2
  697. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
  698. transformers/models/pixtral/modeling_pixtral.py +9 -12
  699. transformers/models/plbart/configuration_plbart.py +8 -5
  700. transformers/models/plbart/modeling_plbart.py +9 -7
  701. transformers/models/plbart/modular_plbart.py +1 -1
  702. transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
  703. transformers/models/pop2piano/configuration_pop2piano.py +7 -6
  704. transformers/models/pop2piano/modeling_pop2piano.py +2 -1
  705. transformers/models/pp_doclayout_v3/__init__.py +30 -0
  706. transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
  707. transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
  708. transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
  709. transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
  710. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
  711. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
  712. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
  713. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
  714. transformers/models/prophetnet/configuration_prophetnet.py +11 -10
  715. transformers/models/prophetnet/modeling_prophetnet.py +12 -23
  716. transformers/models/pvt/image_processing_pvt.py +7 -7
  717. transformers/models/pvt/image_processing_pvt_fast.py +1 -1
  718. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  719. transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
  720. transformers/models/qwen2/configuration_qwen2.py +14 -4
  721. transformers/models/qwen2/modeling_qwen2.py +4 -4
  722. transformers/models/qwen2/modular_qwen2.py +3 -3
  723. transformers/models/qwen2/tokenization_qwen2.py +0 -4
  724. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
  725. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
  726. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
  727. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
  728. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
  729. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
  730. transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
  731. transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
  732. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  733. transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
  734. transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
  735. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
  736. transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
  737. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
  738. transformers/models/qwen3/configuration_qwen3.py +15 -5
  739. transformers/models/qwen3/modeling_qwen3.py +4 -4
  740. transformers/models/qwen3/modular_qwen3.py +3 -3
  741. transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
  742. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  743. transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
  744. transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
  745. transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
  746. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
  747. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
  748. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
  749. transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
  750. transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
  751. transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
  752. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
  753. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
  754. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
  755. transformers/models/rag/configuration_rag.py +6 -6
  756. transformers/models/rag/modeling_rag.py +3 -3
  757. transformers/models/rag/retrieval_rag.py +1 -1
  758. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
  759. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
  760. transformers/models/reformer/configuration_reformer.py +7 -7
  761. transformers/models/rembert/configuration_rembert.py +8 -1
  762. transformers/models/rembert/modeling_rembert.py +0 -22
  763. transformers/models/resnet/configuration_resnet.py +2 -4
  764. transformers/models/resnet/modeling_resnet.py +6 -5
  765. transformers/models/roberta/configuration_roberta.py +11 -2
  766. transformers/models/roberta/modeling_roberta.py +6 -6
  767. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
  768. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
  769. transformers/models/roc_bert/configuration_roc_bert.py +8 -1
  770. transformers/models/roc_bert/modeling_roc_bert.py +6 -41
  771. transformers/models/roformer/configuration_roformer.py +13 -2
  772. transformers/models/roformer/modeling_roformer.py +0 -14
  773. transformers/models/rt_detr/configuration_rt_detr.py +8 -49
  774. transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
  775. transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
  776. transformers/models/rt_detr/modeling_rt_detr.py +578 -737
  777. transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
  778. transformers/models/rt_detr/modular_rt_detr.py +1508 -6
  779. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
  780. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
  781. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
  782. transformers/models/rwkv/configuration_rwkv.py +2 -3
  783. transformers/models/rwkv/modeling_rwkv.py +0 -23
  784. transformers/models/sam/configuration_sam.py +2 -0
  785. transformers/models/sam/image_processing_sam_fast.py +4 -4
  786. transformers/models/sam/modeling_sam.py +13 -8
  787. transformers/models/sam/processing_sam.py +3 -3
  788. transformers/models/sam2/configuration_sam2.py +1 -1
  789. transformers/models/sam2/modeling_sam2.py +56 -52
  790. transformers/models/sam2/modular_sam2.py +47 -55
  791. transformers/models/sam2_video/modeling_sam2_video.py +50 -51
  792. transformers/models/sam2_video/modular_sam2_video.py +12 -10
  793. transformers/models/sam3/modeling_sam3.py +43 -47
  794. transformers/models/sam3/processing_sam3.py +8 -4
  795. transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
  796. transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
  797. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
  798. transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
  799. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
  800. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
  801. transformers/models/sam3_video/modeling_sam3_video.py +27 -14
  802. transformers/models/sam_hq/configuration_sam_hq.py +2 -0
  803. transformers/models/sam_hq/modeling_sam_hq.py +13 -9
  804. transformers/models/sam_hq/modular_sam_hq.py +6 -6
  805. transformers/models/sam_hq/processing_sam_hq.py +7 -6
  806. transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
  807. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
  808. transformers/models/seed_oss/configuration_seed_oss.py +7 -9
  809. transformers/models/seed_oss/modeling_seed_oss.py +4 -4
  810. transformers/models/seed_oss/modular_seed_oss.py +3 -3
  811. transformers/models/segformer/image_processing_segformer_fast.py +4 -4
  812. transformers/models/segformer/modeling_segformer.py +4 -2
  813. transformers/models/segformer/modular_segformer.py +3 -3
  814. transformers/models/seggpt/modeling_seggpt.py +20 -8
  815. transformers/models/sew/configuration_sew.py +4 -1
  816. transformers/models/sew/modeling_sew.py +9 -5
  817. transformers/models/sew/modular_sew.py +2 -1
  818. transformers/models/sew_d/configuration_sew_d.py +4 -1
  819. transformers/models/sew_d/modeling_sew_d.py +4 -1
  820. transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
  821. transformers/models/siglip/configuration_siglip.py +4 -1
  822. transformers/models/siglip/modeling_siglip.py +27 -71
  823. transformers/models/siglip2/__init__.py +1 -0
  824. transformers/models/siglip2/configuration_siglip2.py +4 -2
  825. transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
  826. transformers/models/siglip2/modeling_siglip2.py +37 -78
  827. transformers/models/siglip2/modular_siglip2.py +74 -25
  828. transformers/models/siglip2/tokenization_siglip2.py +95 -0
  829. transformers/models/smollm3/configuration_smollm3.py +6 -6
  830. transformers/models/smollm3/modeling_smollm3.py +4 -4
  831. transformers/models/smollm3/modular_smollm3.py +9 -9
  832. transformers/models/smolvlm/configuration_smolvlm.py +1 -3
  833. transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
  834. transformers/models/smolvlm/modeling_smolvlm.py +75 -46
  835. transformers/models/smolvlm/modular_smolvlm.py +36 -23
  836. transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
  837. transformers/models/solar_open/__init__.py +27 -0
  838. transformers/models/solar_open/configuration_solar_open.py +184 -0
  839. transformers/models/solar_open/modeling_solar_open.py +642 -0
  840. transformers/models/solar_open/modular_solar_open.py +224 -0
  841. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
  842. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
  843. transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
  844. transformers/models/speecht5/configuration_speecht5.py +7 -8
  845. transformers/models/splinter/configuration_splinter.py +6 -6
  846. transformers/models/splinter/modeling_splinter.py +8 -3
  847. transformers/models/squeezebert/configuration_squeezebert.py +14 -1
  848. transformers/models/stablelm/configuration_stablelm.py +8 -6
  849. transformers/models/stablelm/modeling_stablelm.py +5 -5
  850. transformers/models/starcoder2/configuration_starcoder2.py +11 -5
  851. transformers/models/starcoder2/modeling_starcoder2.py +5 -5
  852. transformers/models/starcoder2/modular_starcoder2.py +4 -4
  853. transformers/models/superglue/configuration_superglue.py +4 -0
  854. transformers/models/superglue/image_processing_superglue_fast.py +4 -3
  855. transformers/models/superglue/modeling_superglue.py +9 -4
  856. transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
  857. transformers/models/superpoint/modeling_superpoint.py +4 -2
  858. transformers/models/swin/configuration_swin.py +2 -4
  859. transformers/models/swin/modeling_swin.py +11 -8
  860. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
  861. transformers/models/swin2sr/modeling_swin2sr.py +4 -2
  862. transformers/models/swinv2/configuration_swinv2.py +2 -4
  863. transformers/models/swinv2/modeling_swinv2.py +10 -7
  864. transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
  865. transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
  866. transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
  867. transformers/models/t5/configuration_t5.py +9 -8
  868. transformers/models/t5/modeling_t5.py +5 -8
  869. transformers/models/t5gemma/configuration_t5gemma.py +10 -25
  870. transformers/models/t5gemma/modeling_t5gemma.py +9 -9
  871. transformers/models/t5gemma/modular_t5gemma.py +11 -24
  872. transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
  873. transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
  874. transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
  875. transformers/models/table_transformer/configuration_table_transformer.py +18 -49
  876. transformers/models/table_transformer/modeling_table_transformer.py +27 -53
  877. transformers/models/tapas/configuration_tapas.py +12 -1
  878. transformers/models/tapas/modeling_tapas.py +1 -1
  879. transformers/models/tapas/tokenization_tapas.py +1 -0
  880. transformers/models/textnet/configuration_textnet.py +4 -6
  881. transformers/models/textnet/image_processing_textnet_fast.py +3 -3
  882. transformers/models/textnet/modeling_textnet.py +15 -14
  883. transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
  884. transformers/models/timesfm/modeling_timesfm.py +5 -6
  885. transformers/models/timesfm/modular_timesfm.py +5 -6
  886. transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
  887. transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
  888. transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
  889. transformers/models/trocr/configuration_trocr.py +11 -7
  890. transformers/models/trocr/modeling_trocr.py +4 -2
  891. transformers/models/tvp/configuration_tvp.py +10 -35
  892. transformers/models/tvp/image_processing_tvp_fast.py +6 -5
  893. transformers/models/tvp/modeling_tvp.py +1 -1
  894. transformers/models/udop/configuration_udop.py +16 -7
  895. transformers/models/udop/modeling_udop.py +10 -6
  896. transformers/models/umt5/configuration_umt5.py +8 -6
  897. transformers/models/umt5/modeling_umt5.py +7 -3
  898. transformers/models/unispeech/configuration_unispeech.py +4 -1
  899. transformers/models/unispeech/modeling_unispeech.py +7 -4
  900. transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
  901. transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
  902. transformers/models/upernet/configuration_upernet.py +8 -35
  903. transformers/models/upernet/modeling_upernet.py +1 -1
  904. transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
  905. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  906. transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
  907. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
  908. transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
  909. transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
  910. transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
  911. transformers/models/video_llava/configuration_video_llava.py +4 -0
  912. transformers/models/video_llava/modeling_video_llava.py +87 -89
  913. transformers/models/videomae/modeling_videomae.py +4 -5
  914. transformers/models/vilt/configuration_vilt.py +4 -1
  915. transformers/models/vilt/image_processing_vilt_fast.py +6 -6
  916. transformers/models/vilt/modeling_vilt.py +27 -12
  917. transformers/models/vipllava/configuration_vipllava.py +4 -0
  918. transformers/models/vipllava/modeling_vipllava.py +57 -31
  919. transformers/models/vipllava/modular_vipllava.py +50 -24
  920. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
  921. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
  922. transformers/models/visual_bert/configuration_visual_bert.py +6 -1
  923. transformers/models/vit/configuration_vit.py +2 -2
  924. transformers/models/vit/modeling_vit.py +7 -5
  925. transformers/models/vit_mae/modeling_vit_mae.py +11 -7
  926. transformers/models/vit_msn/modeling_vit_msn.py +11 -7
  927. transformers/models/vitdet/configuration_vitdet.py +2 -4
  928. transformers/models/vitdet/modeling_vitdet.py +2 -3
  929. transformers/models/vitmatte/configuration_vitmatte.py +6 -35
  930. transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
  931. transformers/models/vitmatte/modeling_vitmatte.py +1 -1
  932. transformers/models/vitpose/configuration_vitpose.py +6 -43
  933. transformers/models/vitpose/modeling_vitpose.py +5 -3
  934. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
  935. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
  936. transformers/models/vits/configuration_vits.py +4 -0
  937. transformers/models/vits/modeling_vits.py +9 -7
  938. transformers/models/vivit/modeling_vivit.py +4 -4
  939. transformers/models/vjepa2/modeling_vjepa2.py +9 -9
  940. transformers/models/voxtral/configuration_voxtral.py +0 -1
  941. transformers/models/voxtral/modeling_voxtral.py +25 -24
  942. transformers/models/voxtral/modular_voxtral.py +26 -20
  943. transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
  944. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
  945. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
  946. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
  947. transformers/models/wavlm/configuration_wavlm.py +4 -1
  948. transformers/models/wavlm/modeling_wavlm.py +4 -1
  949. transformers/models/whisper/configuration_whisper.py +6 -4
  950. transformers/models/whisper/generation_whisper.py +0 -1
  951. transformers/models/whisper/modeling_whisper.py +3 -3
  952. transformers/models/x_clip/configuration_x_clip.py +4 -1
  953. transformers/models/x_clip/modeling_x_clip.py +26 -27
  954. transformers/models/xglm/configuration_xglm.py +9 -7
  955. transformers/models/xlm/configuration_xlm.py +10 -7
  956. transformers/models/xlm/modeling_xlm.py +1 -1
  957. transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
  958. transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
  959. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
  960. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
  961. transformers/models/xlnet/configuration_xlnet.py +3 -1
  962. transformers/models/xlstm/configuration_xlstm.py +5 -7
  963. transformers/models/xlstm/modeling_xlstm.py +0 -32
  964. transformers/models/xmod/configuration_xmod.py +11 -2
  965. transformers/models/xmod/modeling_xmod.py +13 -16
  966. transformers/models/yolos/image_processing_yolos_fast.py +25 -28
  967. transformers/models/yolos/modeling_yolos.py +7 -7
  968. transformers/models/yolos/modular_yolos.py +16 -16
  969. transformers/models/yoso/configuration_yoso.py +8 -1
  970. transformers/models/youtu/__init__.py +27 -0
  971. transformers/models/youtu/configuration_youtu.py +194 -0
  972. transformers/models/youtu/modeling_youtu.py +619 -0
  973. transformers/models/youtu/modular_youtu.py +254 -0
  974. transformers/models/zamba/configuration_zamba.py +5 -7
  975. transformers/models/zamba/modeling_zamba.py +25 -56
  976. transformers/models/zamba2/configuration_zamba2.py +8 -13
  977. transformers/models/zamba2/modeling_zamba2.py +53 -78
  978. transformers/models/zamba2/modular_zamba2.py +36 -29
  979. transformers/models/zoedepth/configuration_zoedepth.py +17 -40
  980. transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
  981. transformers/models/zoedepth/modeling_zoedepth.py +5 -3
  982. transformers/pipelines/__init__.py +1 -61
  983. transformers/pipelines/any_to_any.py +1 -1
  984. transformers/pipelines/automatic_speech_recognition.py +0 -2
  985. transformers/pipelines/base.py +1 -1
  986. transformers/pipelines/image_text_to_text.py +1 -1
  987. transformers/pipelines/text_to_audio.py +5 -1
  988. transformers/processing_utils.py +35 -44
  989. transformers/pytorch_utils.py +2 -26
  990. transformers/quantizers/quantizer_compressed_tensors.py +7 -5
  991. transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
  992. transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
  993. transformers/quantizers/quantizer_mxfp4.py +1 -1
  994. transformers/quantizers/quantizer_torchao.py +0 -16
  995. transformers/safetensors_conversion.py +11 -4
  996. transformers/testing_utils.py +3 -28
  997. transformers/tokenization_mistral_common.py +9 -0
  998. transformers/tokenization_python.py +6 -4
  999. transformers/tokenization_utils_base.py +119 -219
  1000. transformers/tokenization_utils_tokenizers.py +31 -2
  1001. transformers/trainer.py +25 -33
  1002. transformers/trainer_seq2seq.py +1 -1
  1003. transformers/training_args.py +411 -417
  1004. transformers/utils/__init__.py +1 -4
  1005. transformers/utils/auto_docstring.py +15 -18
  1006. transformers/utils/backbone_utils.py +13 -373
  1007. transformers/utils/doc.py +4 -36
  1008. transformers/utils/generic.py +69 -33
  1009. transformers/utils/import_utils.py +72 -75
  1010. transformers/utils/loading_report.py +133 -105
  1011. transformers/utils/quantization_config.py +0 -21
  1012. transformers/video_processing_utils.py +5 -5
  1013. transformers/video_utils.py +3 -1
  1014. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
  1015. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
  1016. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
  1017. transformers/pipelines/deprecated/text2text_generation.py +0 -408
  1018. transformers/pipelines/image_to_text.py +0 -189
  1019. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
  1020. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
  1021. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -36,12 +36,12 @@ from ... import initialization as init
  from ...activations import ACT2FN
  from ...modeling_flash_attention_utils import FlashAttentionKwargs
  from ...modeling_layers import GradientCheckpointingLayer
- from ...modeling_outputs import BaseModelOutput
+ from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
  from ...processing_utils import Unpack
  from ...pytorch_utils import compile_compatible_method_lru_cache
- from ...utils import ModelOutput, auto_docstring
- from ...utils.generic import OutputRecorder, TransformersKwargs
+ from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging
+ from ...utils.generic import OutputRecorder, is_flash_attention_requested
  from ..auto import AutoModel
  from .configuration_sam3_tracker_video import (
      Sam3TrackerVideoConfig,
@@ -50,6 +50,9 @@ from .configuration_sam3_tracker_video import (
  )


+ logger = logging.get_logger(__name__)
+
+
  class Sam3TrackerVideoInferenceCache:
      """Cache for vision features and model constants."""

@@ -475,9 +478,18 @@ class Sam3TrackerVideoAttention(nn.Module):
          key = self.k_proj(key).view(*new_shape).transpose(1, 2)
          value = self.v_proj(value).view(*new_shape).transpose(1, 2)

-         attention_interface: Callable = eager_attention_forward
-         if self.config._attn_implementation != "eager":
-             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+         attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+             self.config._attn_implementation, eager_attention_forward
+         )
+
+         if is_flash_attention_requested(self.config) and attention_similarity is not None:
+             # Target guided masks are represented as float masks and are incompatible with Flash Attention
+             # Fallback to SDPA for this call only so the rest of the model can still benefit from FA
+             attention_interface = ALL_ATTENTION_FUNCTIONS["sdpa"]
+             logger.warning_once(
+                 "Falling back to SDPA for target-guided attention because "
+                 "Flash Attention does not support additive bias masks."
+             )

          attn_output, attn_weights = attention_interface(
              self,
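For context on the hunk above: 5.1.0 replaces the manual eager/`ALL_ATTENTION_FUNCTIONS[...]` branch with `get_interface`, and adds a per-call SDPA fallback when Flash Attention is requested together with an additive float mask. A minimal sketch of the dispatch logic, assuming a config object exposing `_attn_implementation` and an `eager_attention_forward` callable (both names taken from this diff):

```python
from typing import Callable

from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
from transformers.utils.generic import is_flash_attention_requested


def select_attention_interface(config, eager_attention_forward: Callable, has_float_mask: bool) -> Callable:
    # get_interface resolves the registered kernel for the configured backend,
    # falling back to the second argument for "eager" (or unregistered) names.
    interface = ALL_ATTENTION_FUNCTIONS.get_interface(
        config._attn_implementation, eager_attention_forward
    )
    # Flash Attention cannot consume additive float masks, so such calls drop
    # down to SDPA while the rest of the model keeps the requested backend.
    if is_flash_attention_requested(config) and has_float_mask:
        interface = ALL_ATTENTION_FUNCTIONS["sdpa"]
    return interface
```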
@@ -499,7 +511,7 @@
          return attn_output, attn_weights


- class Sam3TrackerVideoTwoWayAttentionBlock(nn.Module):
+ class Sam3TrackerVideoTwoWayAttentionBlock(GradientCheckpointingLayer):
      def __init__(self, config: Sam3TrackerVideoMaskDecoderConfig, skip_first_layer_pe: bool = False):
          """
          A transformer block with four layers:
@@ -674,7 +686,7 @@ class Sam3TrackerVideoPreTrainedModel(PreTrainedModel):
      main_input_name = "pixel_values"
      input_modalities = "video"
      _supports_sdpa = True
-     _supports_flash_attn_2 = True
+     _supports_flash_attn = True
      _supports_attention_backend = True

      @torch.no_grad()
@@ -859,9 +871,9 @@ class Sam3TrackerVideoRoPEAttention(nn.Module):
              query, key, cos, sin, repeat_freqs_k=self.rope_k_repeat, num_k_exclude_rope=num_k_exclude_rope
          )

-         attention_interface: Callable = eager_attention_forward
-         if self.config._attn_implementation != "eager":
-             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+         attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+             self.config._attn_implementation, eager_attention_forward
+         )

          attn_output, attn_weights = attention_interface(
              self,
@@ -1124,16 +1136,10 @@ class Sam3TrackerVideoMemoryEncoder(nn.Module):

  @dataclass
  @auto_docstring(custom_intro="Base class for the vision encoder's outputs.")
- class Sam3TrackerVideoVisionEncoderOutput(ModelOutput):
+ class Sam3TrackerVideoVisionEncoderOutput(BaseModelOutputWithPooling):
      r"""
      last_hidden_state (`torch.FloatTensor` of shape `(batch_size, height, width, hidden_size)`):
          Sequence of hidden-states at the output of the last layer of the model.
-     fpn_hidden_states (`tuple(torch.FloatTensor)`):
-         Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
-         `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck.
-     fpn_position_encoding (`tuple(torch.FloatTensor)`):
-         Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
-         `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`.
      hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
          Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
          one for the output of each stage) of shape `(batch_size, height, width, hidden_size)`. Hidden-states of the
@@ -1142,13 +1148,16 @@ class Sam3TrackerVideoVisionEncoderOutput(ModelOutput):
          Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
          sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
          the self-attention heads.
+     fpn_hidden_states (`tuple(torch.FloatTensor)`):
+         Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
+         `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck.
+     fpn_position_encoding (`tuple(torch.FloatTensor)`):
+         Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape
+         `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`.
      """

-     last_hidden_state: torch.FloatTensor | None = None
      fpn_hidden_states: torch.FloatTensor | None = None
      fpn_position_encoding: torch.FloatTensor | None = None
-     hidden_states: tuple[torch.FloatTensor, ...] | None = None
-     attentions: tuple[torch.FloatTensor, ...] | None = None


  class Sam3TrackerVideoPositionalEmbedding(nn.Module):
@@ -1579,6 +1588,7 @@ def get_1d_sine_pe(pos_inds, dim, temperature=10000):
  class Sam3TrackerVideoModel(Sam3TrackerVideoPreTrainedModel):
      input_modalities = ("video", "text")
      _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(Sam3TrackerVideoTwoWayAttentionBlock, index=2)}
+     _tied_weights_keys = {}
      _keys_to_ignore_on_load_unexpected = [r"^detector_model."]
      _checkpoint_conversion_mapping = {
          r"tracker_model.(.+)": r"\1",  # the regex allows to remove the prefix, and add it back in revert mode
@@ -1675,7 +1685,8 @@ class Sam3TrackerVideoModel(Sam3TrackerVideoPreTrainedModel):
              Input pixel values
          """
          batch_size = pixel_values.shape[0]
-         feature_maps, _, _, _ = self.get_image_features(pixel_values, **kwargs)
+         image_outputs = self.get_image_features(pixel_values, return_dict=True, **kwargs)
+         feature_maps = image_outputs.fpn_hidden_states

          # add no memory embedding to the last feature map
          feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding
@@ -1846,33 +1857,19 @@ class Sam3TrackerVideoModel(Sam3TrackerVideoPreTrainedModel):
              frame_idx=frame_idx,
          )

+     @can_return_tuple
+     @auto_docstring
      def get_image_features(
          self,
          pixel_values: torch.FloatTensor,
          **kwargs: Unpack[TransformersKwargs],
-     ) -> tuple[
-         list[torch.Tensor],
-         list[torch.Tensor],
-         tuple[torch.FloatTensor, ...] | None,
-         tuple[torch.FloatTensor, ...] | None,
-     ]:
+     ) -> tuple | Sam3TrackerVideoVisionEncoderOutput:
          r"""
-         Extract and preprocess image features using the vision encoder.
-
-         Args:
-             pixel_values (`torch.FloatTensor`):
-                 Input pixel values of shape `(batch_size, num_channels, height, width)`.
-
-         Returns:
-             `tuple`: A tuple containing:
-                 - feature_maps (`list[torch.Tensor]`): List of feature maps from different levels.
-                 - feature_maps_position_embeddings (`list[torch.Tensor]`): List of positional embeddings for each feature level.
-                 - vision_hidden_states (`tuple[torch.FloatTensor]`, *optional*): Hidden states from the vision encoder.
-                 - vision_attentions (`tuple[torch.FloatTensor]`, *optional*): Attention weights from the vision encoder.
+         pixel_values (`torch.FloatTensor`):
+             Input pixel values of shape `(batch_size, num_channels, height, width)`.
          """
          vision_outputs: Sam3TrackerVideoVisionEncoderOutput = self.vision_encoder(
-             pixel_values,
-             **kwargs,
+             pixel_values, return_dict=True, **kwargs
          )

          feature_maps = vision_outputs.fpn_hidden_states
@@ -1890,8 +1887,10 @@ class Sam3TrackerVideoModel(Sam3TrackerVideoPreTrainedModel):
              feature_map_position_embedding.flatten(2).permute(2, 0, 1)
              for feature_map_position_embedding in feature_maps_position_embeddings[:-1]
          ]
+         vision_outputs.fpn_hidden_states = feature_maps
+         vision_outputs.fpn_position_encoding = feature_maps_position_embeddings

-         return feature_maps, feature_maps_position_embeddings, vision_outputs.hidden_states, vision_outputs.attentions
+         return vision_outputs

      def _prepare_vision_features(
          self,
@@ -1908,7 +1907,9 @@ class Sam3TrackerVideoModel(Sam3TrackerVideoPreTrainedModel):
          else:
              # Compute features using image encoder
              image_batch = inference_session.get_frame(frame_idx).unsqueeze(0)  # Add batch dimension
-             vision_feats, vision_pos_embeds, _, _ = self.get_image_features(image_batch)
+             image_outputs = self.get_image_features(image_batch, return_dict=True)
+             vision_feats = image_outputs.fpn_hidden_states
+             vision_pos_embeds = image_outputs.fpn_position_encoding

              # Cache features
              inference_session.cache.cache_vision_features(
                  frame_idx, {"vision_feats": vision_feats, "vision_pos_embeds": vision_pos_embeds}
@@ -2013,10 +2014,10 @@ class Sam3TrackerVideoModel(Sam3TrackerVideoPreTrainedModel):
          vision_hidden_states = None

          if pixel_values is not None:
-             feature_maps, _, vision_hidden_states, vision_attentions = self.get_image_features(
-                 pixel_values,
-                 **kwargs,
-             )
+             image_outputs = self.get_image_features(pixel_values, return_dict=True, **kwargs)
+             feature_maps = image_outputs.fpn_hidden_states
+             vision_hidden_states = image_outputs.hidden_states
+             vision_attentions = image_outputs.attentions

              # add no memory embedding to the last feature map
              feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding
@@ -2506,7 +2507,7 @@ class Sam3TrackerVideoModel(Sam3TrackerVideoPreTrainedModel):
          num_object_pointer_tokens = object_pointers.shape[0]

          # Step 4: Concatenate all retrieved memories and their positional embeddings
-         combined_memory = torch.cat(memories_to_concatenate, dim=0)
+         combined_memory = torch.cat(memories_to_concatenate, dim=0).to(dtype=inference_session.dtype)
          combined_memory_positional_embeddings = torch.cat(memory_positional_embeddings_to_concatenate, dim=0)

          # Step 5: Forward through the memory attention mechanism
@@ -17,7 +17,7 @@ import torch

  from ...configuration_utils import PreTrainedConfig
  from ...processing_utils import Unpack
- from ...utils.generic import TransformersKwargs
+ from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
  from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel
  from ..sam2_video.configuration_sam2_video import Sam2VideoMaskDecoderConfig, Sam2VideoPromptEncoderConfig
  from ..sam2_video.modeling_sam2_video import (
@@ -544,33 +544,19 @@ class Sam3TrackerVideoModel(Sam2VideoModel):

          self.post_init()

+     @can_return_tuple
+     @auto_docstring
      def get_image_features(
          self,
          pixel_values: torch.FloatTensor,
          **kwargs: Unpack[TransformersKwargs],
-     ) -> tuple[
-         list[torch.Tensor],
-         list[torch.Tensor],
-         tuple[torch.FloatTensor, ...] | None,
-         tuple[torch.FloatTensor, ...] | None,
-     ]:
+     ) -> tuple | Sam3TrackerVideoVisionEncoderOutput:
          r"""
-         Extract and preprocess image features using the vision encoder.
-
-         Args:
-             pixel_values (`torch.FloatTensor`):
-                 Input pixel values of shape `(batch_size, num_channels, height, width)`.
-
-         Returns:
-             `tuple`: A tuple containing:
-                 - feature_maps (`list[torch.Tensor]`): List of feature maps from different levels.
-                 - feature_maps_position_embeddings (`list[torch.Tensor]`): List of positional embeddings for each feature level.
-                 - vision_hidden_states (`tuple[torch.FloatTensor]`, *optional*): Hidden states from the vision encoder.
-                 - vision_attentions (`tuple[torch.FloatTensor]`, *optional*): Attention weights from the vision encoder.
+         pixel_values (`torch.FloatTensor`):
+             Input pixel values of shape `(batch_size, num_channels, height, width)`.
          """
          vision_outputs: Sam3TrackerVideoVisionEncoderOutput = self.vision_encoder(
-             pixel_values,
-             **kwargs,
+             pixel_values, return_dict=True, **kwargs
          )

          feature_maps = vision_outputs.fpn_hidden_states
@@ -588,8 +574,10 @@ class Sam3TrackerVideoModel(Sam2VideoModel):
              feature_map_position_embedding.flatten(2).permute(2, 0, 1)
              for feature_map_position_embedding in feature_maps_position_embeddings[:-1]
          ]
+         vision_outputs.fpn_hidden_states = feature_maps
+         vision_outputs.fpn_position_encoding = feature_maps_position_embeddings

-         return feature_maps, feature_maps_position_embeddings, vision_outputs.hidden_states, vision_outputs.attentions
+         return vision_outputs


  __all__ = [
@@ -14,6 +14,7 @@


  from collections import OrderedDict, defaultdict
+ from collections.abc import Iterator
  from copy import deepcopy
  from dataclasses import dataclass
  from typing import Any
@@ -54,7 +55,7 @@ def _load_cv_utils_kernel_once():
          return

      try:
-         cv_utils_kernel = get_kernel("kernels-community/cv_utils")
+         cv_utils_kernel = get_kernel("kernels-community/cv-utils")
      except Exception as e:
          logger.warning_once(
              f"Failed to load cv_utils kernel (your torch/cuda setup may not be supported): {e}. "
@@ -590,7 +591,8 @@ class Sam3VideoModel(Sam3VideoPreTrainedModel):
              text_embeds = self.detector_model.get_text_features(
                  input_ids=inference_session.prompt_input_ids[prompt_id],
                  attention_mask=inference_session.prompt_attention_masks[prompt_id],
-             )
+                 return_dict=True,
+             ).pooler_output
              inference_session.prompt_embeddings[prompt_id] = text_embeds
          else:
              text_embeds = inference_session.prompt_embeddings[prompt_id]
@@ -1780,20 +1782,31 @@ class Sam3VideoModel(Sam3VideoPreTrainedModel):
          return processing_order, end_frame_idx

      @torch.inference_mode()
+     @auto_docstring(
+         custom_intro="""
+         Propagate the prompts to get grounding results for the entire video. Used when initializing an inference session with a whole video.
+         Yields Sam3VideoSegmentationOutput for each frame.
+         """
+     )
      def propagate_in_video_iterator(
          self,
          inference_session: Sam3VideoInferenceSession,
-         start_frame_idx=0,
-         max_frame_num_to_track=None,
-         reverse=False,
-     ):
-         """
-         Propagate the prompts to get grounding results for the entire video. This method
-         is a generator and yields inference outputs for all frames in the range specified
-         by `start_frame_idx`, `max_frame_num_to_track`, and `reverse`.
-
-         Yields:
-             `Sam3VideoSegmentationOutput`: The segmentation output for each frame.
+         start_frame_idx: int = 0,
+         max_frame_num_to_track: int | None = None,
+         reverse: bool = False,
+         show_progress_bar: bool = False,
+     ) -> Iterator[Sam3VideoSegmentationOutput]:
+         r"""
+         inference_session (`Sam3VideoInferenceSession`):
+             The video inference session object.
+         start_frame_idx (`int`, *optional*, defaults to `0`):
+             The starting frame index for propagation.
+         max_frame_num_to_track (`int`, *optional*):
+             The maximum number of frames to track. If not provided, all frames in the video will be tracked.
+         reverse (`bool`, *optional*, defaults to `False`):
+             Whether to propagate in reverse.
+         show_progress_bar (`bool`, *optional*, defaults to `False`):
+             Whether to show a progress bar during propagation.
          """
          processing_order, end_frame_idx = self._get_processing_order(
              inference_session,
@@ -1803,7 +1816,7 @@ class Sam3VideoModel(Sam3VideoPreTrainedModel):
          )

          hotstart_buffer = []
-         for frame_idx in tqdm(processing_order):
+         for frame_idx in tqdm(processing_order, desc="propagate in video", disable=not show_progress_bar):
              out = self(inference_session=inference_session, frame_idx=frame_idx, reverse=reverse)

              if self.hotstart_delay > 0:
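The two hunks above turn video propagation into a fully typed generator with an optional tqdm progress bar. A hedged usage sketch, assuming an initialized `Sam3VideoModel` and `Sam3VideoInferenceSession`:

```python
for frame_output in model.propagate_in_video_iterator(
    inference_session,
    start_frame_idx=0,
    max_frame_num_to_track=None,  # None tracks every remaining frame
    reverse=False,
    show_progress_bar=True,       # new in 5.1.0: wraps the loop in tqdm
):
    # Each yielded item is a Sam3VideoSegmentationOutput for one frame.
    handle_frame(frame_output)    # hypothetical user callback
```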
@@ -290,6 +290,7 @@ class SamHQConfig(PreTrainedConfig):
          prompt_encoder_config=None,
          mask_decoder_config=None,
          initializer_range=0.02,
+         tie_word_embeddings=True,
          **kwargs,
      ):
          vision_config = vision_config if vision_config is not None else {}
@@ -307,6 +308,7 @@ class SamHQConfig(PreTrainedConfig):
          self.prompt_encoder_config = SamHQPromptEncoderConfig(**prompt_encoder_config)
          self.mask_decoder_config = SamHQMaskDecoderConfig(**mask_decoder_config)
          self.initializer_range = initializer_range
+         self.tie_word_embeddings = tie_word_embeddings
          super().__init__(**kwargs)


@@ -415,7 +415,7 @@ class SamHQPositionalEmbedding(nn.Module):
      def __init__(self, config):
          super().__init__()
          self.scale = config.scale
-         self.register_buffer("positional_embedding", self.scale * torch.randn((2, config.num_pos_feats)))
+         self.positional_embedding = nn.Parameter(self.scale * torch.randn((2, config.num_pos_feats)))

      def forward(self, input_coords, input_shape=None):
          """Positionally encode points that are normalized to [0,1]."""
@@ -685,9 +685,9 @@ class SamHQAttention(nn.Module):
          value = self._separate_heads(value, self.num_attention_heads)

          # SamHQAttention
-         attention_interface: Callable = eager_attention_forward
-         if self.config._attn_implementation != "eager":
-             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+         attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+             self.config._attn_implementation, eager_attention_forward
+         )

          attn_output, attn_weights = attention_interface(
              self,
@@ -1233,7 +1233,9 @@ class SamHQPromptEncoder(nn.Module):
  class SamHQModel(SamHQPreTrainedModel):
      input_modalities = ("image", "text")
      _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(SamHQTwoWayAttentionBlock, index=2)}
-     _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"]
+     _tied_weights_keys = {
+         "prompt_encoder.shared_embedding.positional_embedding": "shared_image_embedding.positional_embedding"
+     }

      def __init__(self, config):
          super().__init__(config)
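The SamHQ hunks above convert the shared positional embedding from a registered buffer (previously papered over with `_keys_to_ignore_on_load_missing`) into a trainable `nn.Parameter` that is explicitly tied across modules. A minimal sketch of what the mapping expresses, with the module names taken from this diff:

```python
import torch
from torch import nn


class PositionalEmbedding(nn.Module):
    def __init__(self, num_pos_feats: int, scale: float = 1.0):
        super().__init__()
        # A trainable parameter (previously a buffer), so the weight-tying
        # machinery can alias it between modules.
        self.positional_embedding = nn.Parameter(scale * torch.randn(2, num_pos_feats))


# On the model class, each key on the left is resolved to share storage with
# the parameter named on the right after loading:
# _tied_weights_keys = {
#     "prompt_encoder.shared_embedding.positional_embedding":
#         "shared_image_embedding.positional_embedding",
# }
```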
@@ -1393,16 +1395,18 @@ class SamHQModel(SamHQPreTrainedModel):

          ```python
          >>> from PIL import Image
-         >>> import requests
+         >>> import httpx
+         >>> from io import BytesIO
          >>> from transformers import AutoModel, AutoProcessor

          >>> model = AutoModel.from_pretrained("sushmanth/sam_hq_vit_b")
          >>> processor = AutoProcessor.from_pretrained("sushmanth/sam_hq_vit_b")

-         >>> img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
-         >>> raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
+         >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
+         >>> with httpx.stream("GET", url) as response:
+         ...     image = Image.open(BytesIO(response.read())).convert("RGB")
          >>> input_points = [[[400, 650]]]  # 2D location of a window on the car
-         >>> inputs = processor(images=raw_image, input_points=input_points, return_tensors="pt")
+         >>> inputs = processor(images=image, input_points=input_points, return_tensors="pt")

          >>> # Get high-quality segmentation mask
          >>> outputs = model(**inputs)
@@ -440,8 +440,6 @@ class SamHQVisionModel(SamVisionModel):
      """
  )
  class SamHQModel(SamModel):
-     _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"]
-
      def __init__(self, config):
          super().__init__(config)
          self.vision_encoder = SamHQVisionEncoder(config.vision_config)
@@ -546,16 +544,18 @@ class SamHQModel(SamModel):

          ```python
          >>> from PIL import Image
-         >>> import requests
+         >>> import httpx
+         >>> from io import BytesIO
          >>> from transformers import AutoModel, AutoProcessor

          >>> model = AutoModel.from_pretrained("sushmanth/sam_hq_vit_b")
          >>> processor = AutoProcessor.from_pretrained("sushmanth/sam_hq_vit_b")

-         >>> img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
-         >>> raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
+         >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
+         >>> with httpx.stream("GET", url) as response:
+         ...     image = Image.open(BytesIO(response.read())).convert("RGB")
          >>> input_points = [[[400, 650]]]  # 2D location of a window on the car
-         >>> inputs = processor(images=raw_image, input_points=input_points, return_tensors="pt")
+         >>> inputs = processor(images=image, input_points=input_points, return_tensors="pt")

          >>> # Get high-quality segmentation mask
          >>> outputs = model(**inputs)
@@ -16,19 +16,20 @@ Processor class for SAMHQ.
  """

  from copy import deepcopy
+ from typing import Union

  import numpy as np

+ from ...feature_extraction_utils import BatchFeature
  from ...image_utils import ImageInput
  from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
- from ...tokenization_utils_base import BatchEncoding
  from ...utils import auto_docstring, is_torch_available


  if is_torch_available():
      import torch

- NestedList = list[float | int | None | list[float | int | None]]
+ NestedList = list[Union[float | int | None, "NestedList"]]


  class SamHQImagesKwargs(ImagesKwargs, total=False):
@@ -61,9 +62,9 @@ class SamHQImagesKwargs(ImagesKwargs, total=False):
      """

      segmentation_maps: ImageInput | None
-     input_points: NestedList | None
-     input_labels: NestedList | None
-     input_boxes: NestedList | None
+     input_points: "NestedList | torch.Tensor | None"
+     input_labels: "NestedList | int | torch.Tensor | None"
+     input_boxes: "NestedList | torch.Tensor | None"
      point_pad_value: int | None
      mask_size: dict[str, int]
      mask_pad_size: dict[str, int]
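The processor hunks above make `NestedList` properly recursive (the old alias only typed two levels of nesting) and widen the prompt kwargs to also accept tensors. A minimal demonstration of the recursive alias; the sample values are illustrative:

```python
from typing import Union

# A string forward reference inside Union lets the alias refer to itself,
# so arbitrarily deep point/box nesting is covered by one definition.
NestedList = list[Union[float | int | None, "NestedList"]]

input_points: "NestedList" = [[[400, 650]]]           # batch -> prompt -> (x, y)
input_boxes: "NestedList" = [[[75, 275, 1725, 850]]]  # batch -> box -> (x1, y1, x2, y2)
```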
@@ -94,7 +95,7 @@ class SamHQProcessor(ProcessorMixin):
          self,
          images: ImageInput | None = None,
          **kwargs: Unpack[SamHQProcessorKwargs],
-     ) -> BatchEncoding:
+     ) -> BatchFeature:
          output_kwargs = self._merge_kwargs(
              SamHQProcessorKwargs,
              tokenizer_init_kwargs={},
@@ -316,6 +316,7 @@ class SeamlessM4TConfig(PreTrainedConfig):
          variance_predictor_kernel_size=3,
          var_pred_dropout=0.5,
          vocoder_offset=4,
+         tie_word_embeddings=True,
          **kwargs,
      ):
          # overall_config
@@ -334,6 +335,7 @@ class SeamlessM4TConfig(PreTrainedConfig):
          self.attention_dropout = attention_dropout
          self.activation_dropout = activation_dropout
          self.scale_embedding = scale_embedding
+         self.tie_word_embeddings = tie_word_embeddings
          # for proper config init
          self.num_attention_heads = decoder_attention_heads
          self.num_hidden_layers = decoder_layers
@@ -400,16 +402,13 @@ class SeamlessM4TConfig(PreTrainedConfig):
          self.variance_predictor_kernel_size = variance_predictor_kernel_size
          self.var_pred_dropout = var_pred_dropout
          self.vocoder_offset = vocoder_offset
+         self.pad_token_id = pad_token_id
+         self.bos_token_id = bos_token_id
+         self.eos_token_id = eos_token_id
+         self.max_position_embeddings = max_position_embeddings
+         self.decoder_start_token_id = decoder_start_token_id

-         super().__init__(
-             pad_token_id=pad_token_id,
-             bos_token_id=bos_token_id,
-             eos_token_id=eos_token_id,
-             decoder_start_token_id=decoder_start_token_id,
-             is_encoder_decoder=is_encoder_decoder,
-             max_position_embeddings=max_position_embeddings,
-             **kwargs,
-         )
+         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)


  __all__ = ["SeamlessM4TConfig"]
@@ -322,6 +322,7 @@ class SeamlessM4Tv2Config(PreTrainedConfig):
          variance_predictor_kernel_size=3,
          var_pred_dropout=0.5,
          vocoder_offset=4,
+         tie_word_embeddings=True,
          **kwargs,
      ):
          # overall_config
@@ -341,6 +342,7 @@ class SeamlessM4Tv2Config(PreTrainedConfig):
          self.attention_dropout = attention_dropout
          self.activation_dropout = activation_dropout
          self.scale_embedding = scale_embedding
+         self.tie_word_embeddings = tie_word_embeddings
          # for proper config init
          self.num_attention_heads = decoder_attention_heads
          self.num_hidden_layers = decoder_layers
@@ -409,16 +411,13 @@ class SeamlessM4Tv2Config(PreTrainedConfig):
          self.variance_predictor_kernel_size = variance_predictor_kernel_size
          self.var_pred_dropout = var_pred_dropout
          self.vocoder_offset = vocoder_offset
+         self.pad_token_id = pad_token_id
+         self.bos_token_id = bos_token_id
+         self.eos_token_id = eos_token_id
+         self.decoder_start_token_id = decoder_start_token_id
+         self.max_position_embeddings = max_position_embeddings

-         super().__init__(
-             pad_token_id=pad_token_id,
-             bos_token_id=bos_token_id,
-             eos_token_id=eos_token_id,
-             decoder_start_token_id=decoder_start_token_id,
-             is_encoder_decoder=is_encoder_decoder,
-             max_position_embeddings=max_position_embeddings,
-             **kwargs,
-         )
+         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)


  __all__ = ["SeamlessM4Tv2Config"]
@@ -13,8 +13,8 @@
  # limitations under the License.
  """SeedOss model configuration"""

- from transformers.configuration_utils import PreTrainedConfig
- from transformers.modeling_rope_utils import RopeParameters
+ from ...configuration_utils import PreTrainedConfig
+ from ...modeling_rope_utils import RopeParameters


  class SeedOssConfig(PreTrainedConfig):
@@ -170,13 +170,11 @@ class SeedOssConfig(PreTrainedConfig):
          self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
          self.rope_parameters = rope_parameters

-         super().__init__(
-             pad_token_id=pad_token_id,
-             bos_token_id=bos_token_id,
-             eos_token_id=eos_token_id,
-             tie_word_embeddings=tie_word_embeddings,
-             **kwargs,
-         )
+         self.tie_word_embeddings = tie_word_embeddings
+         self.pad_token_id = pad_token_id
+         self.bos_token_id = bos_token_id
+         self.eos_token_id = eos_token_id
+         super().__init__(**kwargs)


  __all__ = ["SeedOssConfig"]
@@ -206,9 +206,9 @@ class SeedOssAttention(nn.Module):
          cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
          key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

-         attention_interface: Callable = eager_attention_forward
-         if self.config._attn_implementation != "eager":
-             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+         attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+             self.config._attn_implementation, eager_attention_forward
+         )

          attn_output, attn_weights = attention_interface(
              self,
@@ -440,7 +440,7 @@ class SeedOssModel(SeedOssPreTrainedModel):
  @auto_docstring
  class SeedOssForCausalLM(SeedOssPreTrainedModel, GenerationMixin):
      _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
-     _tp_plan = {"lm_head": "colwise_rep"}
+     _tp_plan = {"lm_head": "colwise_gather_output"}
      _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

      def __init__(self, config):
@@ -118,9 +118,9 @@ class SeedOssAttention(nn.Module):
          cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
          key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

-         attention_interface: Callable = eager_attention_forward
-         if self.config._attn_implementation != "eager":
-             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+         attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+             self.config._attn_implementation, eager_attention_forward
+         )

          attn_output, attn_weights = attention_interface(
              self,