transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (1021)
  1. transformers/__init__.py +4 -11
  2. transformers/activations.py +2 -2
  3. transformers/backbone_utils.py +326 -0
  4. transformers/cache_utils.py +11 -2
  5. transformers/cli/serve.py +11 -8
  6. transformers/configuration_utils.py +1 -69
  7. transformers/conversion_mapping.py +146 -26
  8. transformers/convert_slow_tokenizer.py +6 -4
  9. transformers/core_model_loading.py +207 -118
  10. transformers/dependency_versions_check.py +0 -1
  11. transformers/dependency_versions_table.py +7 -8
  12. transformers/file_utils.py +0 -2
  13. transformers/generation/candidate_generator.py +1 -2
  14. transformers/generation/continuous_batching/cache.py +40 -38
  15. transformers/generation/continuous_batching/cache_manager.py +3 -16
  16. transformers/generation/continuous_batching/continuous_api.py +94 -406
  17. transformers/generation/continuous_batching/input_ouputs.py +464 -0
  18. transformers/generation/continuous_batching/requests.py +54 -17
  19. transformers/generation/continuous_batching/scheduler.py +77 -95
  20. transformers/generation/logits_process.py +10 -5
  21. transformers/generation/stopping_criteria.py +1 -2
  22. transformers/generation/utils.py +75 -95
  23. transformers/image_processing_utils.py +0 -3
  24. transformers/image_processing_utils_fast.py +17 -18
  25. transformers/image_transforms.py +44 -13
  26. transformers/image_utils.py +0 -5
  27. transformers/initialization.py +57 -0
  28. transformers/integrations/__init__.py +10 -24
  29. transformers/integrations/accelerate.py +47 -11
  30. transformers/integrations/deepspeed.py +145 -3
  31. transformers/integrations/executorch.py +2 -6
  32. transformers/integrations/finegrained_fp8.py +142 -7
  33. transformers/integrations/flash_attention.py +2 -7
  34. transformers/integrations/hub_kernels.py +18 -7
  35. transformers/integrations/moe.py +226 -106
  36. transformers/integrations/mxfp4.py +47 -34
  37. transformers/integrations/peft.py +488 -176
  38. transformers/integrations/tensor_parallel.py +641 -581
  39. transformers/masking_utils.py +153 -9
  40. transformers/modeling_flash_attention_utils.py +1 -2
  41. transformers/modeling_utils.py +359 -358
  42. transformers/models/__init__.py +6 -0
  43. transformers/models/afmoe/configuration_afmoe.py +14 -4
  44. transformers/models/afmoe/modeling_afmoe.py +8 -8
  45. transformers/models/afmoe/modular_afmoe.py +7 -7
  46. transformers/models/aimv2/configuration_aimv2.py +2 -7
  47. transformers/models/aimv2/modeling_aimv2.py +26 -24
  48. transformers/models/aimv2/modular_aimv2.py +8 -12
  49. transformers/models/albert/configuration_albert.py +8 -1
  50. transformers/models/albert/modeling_albert.py +3 -3
  51. transformers/models/align/configuration_align.py +8 -5
  52. transformers/models/align/modeling_align.py +22 -24
  53. transformers/models/altclip/configuration_altclip.py +4 -6
  54. transformers/models/altclip/modeling_altclip.py +30 -26
  55. transformers/models/apertus/configuration_apertus.py +5 -7
  56. transformers/models/apertus/modeling_apertus.py +4 -4
  57. transformers/models/apertus/modular_apertus.py +8 -10
  58. transformers/models/arcee/configuration_arcee.py +5 -7
  59. transformers/models/arcee/modeling_arcee.py +4 -4
  60. transformers/models/aria/configuration_aria.py +11 -21
  61. transformers/models/aria/modeling_aria.py +39 -36
  62. transformers/models/aria/modular_aria.py +33 -39
  63. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
  64. transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
  65. transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
  66. transformers/models/auto/auto_factory.py +8 -6
  67. transformers/models/auto/configuration_auto.py +22 -0
  68. transformers/models/auto/image_processing_auto.py +17 -13
  69. transformers/models/auto/modeling_auto.py +15 -0
  70. transformers/models/auto/processing_auto.py +9 -18
  71. transformers/models/auto/tokenization_auto.py +17 -15
  72. transformers/models/autoformer/modeling_autoformer.py +2 -1
  73. transformers/models/aya_vision/configuration_aya_vision.py +4 -0
  74. transformers/models/aya_vision/modeling_aya_vision.py +29 -62
  75. transformers/models/aya_vision/modular_aya_vision.py +20 -45
  76. transformers/models/bamba/configuration_bamba.py +17 -7
  77. transformers/models/bamba/modeling_bamba.py +23 -55
  78. transformers/models/bamba/modular_bamba.py +19 -54
  79. transformers/models/bark/configuration_bark.py +2 -1
  80. transformers/models/bark/modeling_bark.py +24 -10
  81. transformers/models/bart/configuration_bart.py +9 -4
  82. transformers/models/bart/modeling_bart.py +9 -12
  83. transformers/models/beit/configuration_beit.py +2 -4
  84. transformers/models/beit/image_processing_beit_fast.py +3 -3
  85. transformers/models/beit/modeling_beit.py +14 -9
  86. transformers/models/bert/configuration_bert.py +12 -1
  87. transformers/models/bert/modeling_bert.py +6 -30
  88. transformers/models/bert_generation/configuration_bert_generation.py +17 -1
  89. transformers/models/bert_generation/modeling_bert_generation.py +6 -6
  90. transformers/models/big_bird/configuration_big_bird.py +12 -8
  91. transformers/models/big_bird/modeling_big_bird.py +0 -15
  92. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
  93. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
  94. transformers/models/biogpt/configuration_biogpt.py +8 -1
  95. transformers/models/biogpt/modeling_biogpt.py +4 -8
  96. transformers/models/biogpt/modular_biogpt.py +1 -5
  97. transformers/models/bit/configuration_bit.py +2 -4
  98. transformers/models/bit/modeling_bit.py +6 -5
  99. transformers/models/bitnet/configuration_bitnet.py +5 -7
  100. transformers/models/bitnet/modeling_bitnet.py +3 -4
  101. transformers/models/bitnet/modular_bitnet.py +3 -4
  102. transformers/models/blenderbot/configuration_blenderbot.py +8 -4
  103. transformers/models/blenderbot/modeling_blenderbot.py +4 -4
  104. transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
  105. transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
  106. transformers/models/blip/configuration_blip.py +9 -9
  107. transformers/models/blip/modeling_blip.py +55 -37
  108. transformers/models/blip_2/configuration_blip_2.py +2 -1
  109. transformers/models/blip_2/modeling_blip_2.py +81 -56
  110. transformers/models/bloom/configuration_bloom.py +5 -1
  111. transformers/models/bloom/modeling_bloom.py +2 -1
  112. transformers/models/blt/configuration_blt.py +23 -12
  113. transformers/models/blt/modeling_blt.py +20 -14
  114. transformers/models/blt/modular_blt.py +70 -10
  115. transformers/models/bridgetower/configuration_bridgetower.py +7 -1
  116. transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
  117. transformers/models/bridgetower/modeling_bridgetower.py +29 -15
  118. transformers/models/bros/configuration_bros.py +24 -17
  119. transformers/models/camembert/configuration_camembert.py +8 -1
  120. transformers/models/camembert/modeling_camembert.py +6 -6
  121. transformers/models/canine/configuration_canine.py +4 -1
  122. transformers/models/chameleon/configuration_chameleon.py +5 -7
  123. transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
  124. transformers/models/chameleon/modeling_chameleon.py +82 -36
  125. transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
  126. transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
  127. transformers/models/clap/configuration_clap.py +4 -8
  128. transformers/models/clap/modeling_clap.py +21 -22
  129. transformers/models/clip/configuration_clip.py +4 -1
  130. transformers/models/clip/image_processing_clip_fast.py +9 -0
  131. transformers/models/clip/modeling_clip.py +25 -22
  132. transformers/models/clipseg/configuration_clipseg.py +4 -1
  133. transformers/models/clipseg/modeling_clipseg.py +27 -25
  134. transformers/models/clipseg/processing_clipseg.py +11 -3
  135. transformers/models/clvp/configuration_clvp.py +14 -2
  136. transformers/models/clvp/modeling_clvp.py +19 -30
  137. transformers/models/codegen/configuration_codegen.py +4 -3
  138. transformers/models/codegen/modeling_codegen.py +2 -1
  139. transformers/models/cohere/configuration_cohere.py +5 -7
  140. transformers/models/cohere/modeling_cohere.py +4 -4
  141. transformers/models/cohere/modular_cohere.py +3 -3
  142. transformers/models/cohere2/configuration_cohere2.py +6 -8
  143. transformers/models/cohere2/modeling_cohere2.py +4 -4
  144. transformers/models/cohere2/modular_cohere2.py +9 -11
  145. transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
  146. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
  147. transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
  148. transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
  149. transformers/models/colqwen2/modeling_colqwen2.py +7 -6
  150. transformers/models/colqwen2/modular_colqwen2.py +7 -6
  151. transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
  152. transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
  153. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
  154. transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
  155. transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
  156. transformers/models/convbert/configuration_convbert.py +11 -7
  157. transformers/models/convnext/configuration_convnext.py +2 -4
  158. transformers/models/convnext/image_processing_convnext_fast.py +2 -2
  159. transformers/models/convnext/modeling_convnext.py +7 -6
  160. transformers/models/convnextv2/configuration_convnextv2.py +2 -4
  161. transformers/models/convnextv2/modeling_convnextv2.py +7 -6
  162. transformers/models/cpmant/configuration_cpmant.py +4 -0
  163. transformers/models/csm/configuration_csm.py +9 -15
  164. transformers/models/csm/modeling_csm.py +3 -3
  165. transformers/models/ctrl/configuration_ctrl.py +16 -0
  166. transformers/models/ctrl/modeling_ctrl.py +13 -25
  167. transformers/models/cwm/configuration_cwm.py +5 -7
  168. transformers/models/cwm/modeling_cwm.py +4 -4
  169. transformers/models/d_fine/configuration_d_fine.py +10 -56
  170. transformers/models/d_fine/modeling_d_fine.py +728 -868
  171. transformers/models/d_fine/modular_d_fine.py +335 -412
  172. transformers/models/dab_detr/configuration_dab_detr.py +22 -48
  173. transformers/models/dab_detr/modeling_dab_detr.py +11 -7
  174. transformers/models/dac/modeling_dac.py +1 -1
  175. transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
  176. transformers/models/data2vec/configuration_data2vec_text.py +11 -2
  177. transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
  178. transformers/models/data2vec/modeling_data2vec_text.py +6 -6
  179. transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
  180. transformers/models/dbrx/configuration_dbrx.py +11 -3
  181. transformers/models/dbrx/modeling_dbrx.py +6 -6
  182. transformers/models/dbrx/modular_dbrx.py +6 -6
  183. transformers/models/deberta/configuration_deberta.py +6 -0
  184. transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
  185. transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
  186. transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
  187. transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
  188. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
  189. transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
  190. transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
  191. transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
  192. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
  193. transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
  194. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
  195. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
  196. transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
  197. transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
  198. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
  199. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
  200. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
  201. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
  202. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
  203. transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
  204. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
  205. transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
  206. transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
  207. transformers/models/deit/modeling_deit.py +11 -7
  208. transformers/models/depth_anything/configuration_depth_anything.py +12 -42
  209. transformers/models/depth_anything/modeling_depth_anything.py +5 -3
  210. transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
  211. transformers/models/depth_pro/modeling_depth_pro.py +8 -4
  212. transformers/models/detr/configuration_detr.py +18 -49
  213. transformers/models/detr/image_processing_detr_fast.py +11 -11
  214. transformers/models/detr/modeling_detr.py +695 -734
  215. transformers/models/dia/configuration_dia.py +4 -7
  216. transformers/models/dia/generation_dia.py +8 -17
  217. transformers/models/dia/modeling_dia.py +7 -7
  218. transformers/models/dia/modular_dia.py +4 -4
  219. transformers/models/diffllama/configuration_diffllama.py +5 -7
  220. transformers/models/diffllama/modeling_diffllama.py +3 -8
  221. transformers/models/diffllama/modular_diffllama.py +2 -7
  222. transformers/models/dinat/configuration_dinat.py +2 -4
  223. transformers/models/dinat/modeling_dinat.py +7 -6
  224. transformers/models/dinov2/configuration_dinov2.py +2 -4
  225. transformers/models/dinov2/modeling_dinov2.py +9 -8
  226. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
  227. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
  228. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
  229. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
  230. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
  231. transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
  232. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
  233. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
  234. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
  235. transformers/models/distilbert/configuration_distilbert.py +8 -1
  236. transformers/models/distilbert/modeling_distilbert.py +3 -3
  237. transformers/models/doge/configuration_doge.py +17 -7
  238. transformers/models/doge/modeling_doge.py +4 -4
  239. transformers/models/doge/modular_doge.py +20 -10
  240. transformers/models/donut/image_processing_donut_fast.py +4 -4
  241. transformers/models/dots1/configuration_dots1.py +16 -7
  242. transformers/models/dots1/modeling_dots1.py +4 -4
  243. transformers/models/dpr/configuration_dpr.py +19 -1
  244. transformers/models/dpt/configuration_dpt.py +23 -65
  245. transformers/models/dpt/image_processing_dpt_fast.py +5 -5
  246. transformers/models/dpt/modeling_dpt.py +19 -15
  247. transformers/models/dpt/modular_dpt.py +4 -4
  248. transformers/models/edgetam/configuration_edgetam.py +1 -1
  249. transformers/models/edgetam/modeling_edgetam.py +53 -53
  250. transformers/models/edgetam/modular_edgetam.py +5 -7
  251. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
  252. transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
  253. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
  254. transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
  255. transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
  256. transformers/models/electra/configuration_electra.py +13 -2
  257. transformers/models/electra/modeling_electra.py +6 -6
  258. transformers/models/emu3/configuration_emu3.py +12 -10
  259. transformers/models/emu3/modeling_emu3.py +84 -47
  260. transformers/models/emu3/modular_emu3.py +77 -39
  261. transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
  262. transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
  263. transformers/models/eomt/configuration_eomt.py +12 -13
  264. transformers/models/eomt/image_processing_eomt_fast.py +3 -3
  265. transformers/models/eomt/modeling_eomt.py +3 -3
  266. transformers/models/eomt/modular_eomt.py +17 -17
  267. transformers/models/eomt_dinov3/__init__.py +28 -0
  268. transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
  269. transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
  270. transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
  271. transformers/models/ernie/configuration_ernie.py +24 -2
  272. transformers/models/ernie/modeling_ernie.py +6 -30
  273. transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
  274. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  275. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
  276. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
  277. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
  278. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
  279. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
  280. transformers/models/esm/configuration_esm.py +9 -11
  281. transformers/models/esm/modeling_esm.py +3 -3
  282. transformers/models/esm/modeling_esmfold.py +1 -6
  283. transformers/models/esm/openfold_utils/protein.py +2 -3
  284. transformers/models/evolla/configuration_evolla.py +21 -8
  285. transformers/models/evolla/modeling_evolla.py +11 -7
  286. transformers/models/evolla/modular_evolla.py +5 -1
  287. transformers/models/exaone4/configuration_exaone4.py +8 -5
  288. transformers/models/exaone4/modeling_exaone4.py +4 -4
  289. transformers/models/exaone4/modular_exaone4.py +11 -8
  290. transformers/models/exaone_moe/__init__.py +27 -0
  291. transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
  292. transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
  293. transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
  294. transformers/models/falcon/configuration_falcon.py +9 -1
  295. transformers/models/falcon/modeling_falcon.py +3 -8
  296. transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
  297. transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
  298. transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
  299. transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
  300. transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
  301. transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
  302. transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
  303. transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
  304. transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
  305. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
  306. transformers/models/flaubert/configuration_flaubert.py +10 -4
  307. transformers/models/flaubert/modeling_flaubert.py +1 -1
  308. transformers/models/flava/configuration_flava.py +4 -3
  309. transformers/models/flava/image_processing_flava_fast.py +4 -4
  310. transformers/models/flava/modeling_flava.py +36 -28
  311. transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
  312. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
  313. transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
  314. transformers/models/florence2/configuration_florence2.py +4 -0
  315. transformers/models/florence2/modeling_florence2.py +57 -32
  316. transformers/models/florence2/modular_florence2.py +48 -26
  317. transformers/models/fnet/configuration_fnet.py +6 -1
  318. transformers/models/focalnet/configuration_focalnet.py +2 -4
  319. transformers/models/focalnet/modeling_focalnet.py +10 -7
  320. transformers/models/fsmt/configuration_fsmt.py +12 -16
  321. transformers/models/funnel/configuration_funnel.py +8 -0
  322. transformers/models/fuyu/configuration_fuyu.py +5 -8
  323. transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
  324. transformers/models/fuyu/modeling_fuyu.py +24 -23
  325. transformers/models/gemma/configuration_gemma.py +5 -7
  326. transformers/models/gemma/modeling_gemma.py +4 -4
  327. transformers/models/gemma/modular_gemma.py +5 -7
  328. transformers/models/gemma2/configuration_gemma2.py +5 -7
  329. transformers/models/gemma2/modeling_gemma2.py +4 -4
  330. transformers/models/gemma2/modular_gemma2.py +8 -10
  331. transformers/models/gemma3/configuration_gemma3.py +28 -22
  332. transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
  333. transformers/models/gemma3/modeling_gemma3.py +37 -33
  334. transformers/models/gemma3/modular_gemma3.py +46 -42
  335. transformers/models/gemma3n/configuration_gemma3n.py +35 -22
  336. transformers/models/gemma3n/modeling_gemma3n.py +86 -58
  337. transformers/models/gemma3n/modular_gemma3n.py +112 -75
  338. transformers/models/git/configuration_git.py +5 -7
  339. transformers/models/git/modeling_git.py +31 -41
  340. transformers/models/glm/configuration_glm.py +7 -9
  341. transformers/models/glm/modeling_glm.py +4 -4
  342. transformers/models/glm4/configuration_glm4.py +7 -9
  343. transformers/models/glm4/modeling_glm4.py +4 -4
  344. transformers/models/glm46v/configuration_glm46v.py +4 -0
  345. transformers/models/glm46v/image_processing_glm46v.py +5 -2
  346. transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
  347. transformers/models/glm46v/modeling_glm46v.py +91 -46
  348. transformers/models/glm46v/modular_glm46v.py +4 -0
  349. transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
  350. transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
  351. transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
  352. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
  353. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
  354. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
  355. transformers/models/glm4v/configuration_glm4v.py +12 -8
  356. transformers/models/glm4v/image_processing_glm4v.py +5 -2
  357. transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
  358. transformers/models/glm4v/modeling_glm4v.py +120 -63
  359. transformers/models/glm4v/modular_glm4v.py +82 -50
  360. transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
  361. transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
  362. transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
  363. transformers/models/glm_image/configuration_glm_image.py +26 -20
  364. transformers/models/glm_image/image_processing_glm_image.py +1 -1
  365. transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
  366. transformers/models/glm_image/modeling_glm_image.py +337 -236
  367. transformers/models/glm_image/modular_glm_image.py +415 -255
  368. transformers/models/glm_image/processing_glm_image.py +65 -17
  369. transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
  370. transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
  371. transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
  372. transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
  373. transformers/models/glmasr/modeling_glmasr.py +34 -28
  374. transformers/models/glmasr/modular_glmasr.py +23 -11
  375. transformers/models/glpn/image_processing_glpn_fast.py +3 -3
  376. transformers/models/glpn/modeling_glpn.py +4 -2
  377. transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
  378. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
  379. transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
  380. transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
  381. transformers/models/gpt2/configuration_gpt2.py +13 -1
  382. transformers/models/gpt2/modeling_gpt2.py +5 -5
  383. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
  384. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
  385. transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
  386. transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
  387. transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
  388. transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
  389. transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
  390. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
  391. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
  392. transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
  393. transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
  394. transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
  395. transformers/models/gptj/configuration_gptj.py +4 -4
  396. transformers/models/gptj/modeling_gptj.py +3 -7
  397. transformers/models/granite/configuration_granite.py +5 -7
  398. transformers/models/granite/modeling_granite.py +4 -4
  399. transformers/models/granite_speech/modeling_granite_speech.py +63 -37
  400. transformers/models/granitemoe/configuration_granitemoe.py +5 -7
  401. transformers/models/granitemoe/modeling_granitemoe.py +4 -4
  402. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
  403. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
  404. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
  405. transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
  406. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
  407. transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
  408. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
  409. transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
  410. transformers/models/groupvit/configuration_groupvit.py +4 -1
  411. transformers/models/groupvit/modeling_groupvit.py +29 -22
  412. transformers/models/helium/configuration_helium.py +5 -7
  413. transformers/models/helium/modeling_helium.py +4 -4
  414. transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
  415. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
  416. transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
  417. transformers/models/hiera/configuration_hiera.py +2 -4
  418. transformers/models/hiera/modeling_hiera.py +11 -8
  419. transformers/models/hubert/configuration_hubert.py +4 -1
  420. transformers/models/hubert/modeling_hubert.py +7 -4
  421. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
  422. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
  423. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
  424. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
  425. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
  426. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
  427. transformers/models/ibert/configuration_ibert.py +4 -1
  428. transformers/models/idefics/configuration_idefics.py +5 -7
  429. transformers/models/idefics/modeling_idefics.py +3 -4
  430. transformers/models/idefics/vision.py +5 -4
  431. transformers/models/idefics2/configuration_idefics2.py +1 -2
  432. transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
  433. transformers/models/idefics2/modeling_idefics2.py +72 -50
  434. transformers/models/idefics3/configuration_idefics3.py +1 -3
  435. transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
  436. transformers/models/idefics3/modeling_idefics3.py +63 -40
  437. transformers/models/ijepa/modeling_ijepa.py +3 -3
  438. transformers/models/imagegpt/configuration_imagegpt.py +9 -1
  439. transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
  440. transformers/models/imagegpt/modeling_imagegpt.py +8 -4
  441. transformers/models/informer/modeling_informer.py +3 -3
  442. transformers/models/instructblip/configuration_instructblip.py +2 -1
  443. transformers/models/instructblip/modeling_instructblip.py +65 -39
  444. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
  445. transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
  446. transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
  447. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
  448. transformers/models/internvl/configuration_internvl.py +5 -0
  449. transformers/models/internvl/modeling_internvl.py +35 -55
  450. transformers/models/internvl/modular_internvl.py +26 -38
  451. transformers/models/internvl/video_processing_internvl.py +2 -2
  452. transformers/models/jais2/configuration_jais2.py +5 -7
  453. transformers/models/jais2/modeling_jais2.py +4 -4
  454. transformers/models/jamba/configuration_jamba.py +5 -7
  455. transformers/models/jamba/modeling_jamba.py +4 -4
  456. transformers/models/jamba/modular_jamba.py +3 -3
  457. transformers/models/janus/image_processing_janus.py +2 -2
  458. transformers/models/janus/image_processing_janus_fast.py +8 -8
  459. transformers/models/janus/modeling_janus.py +63 -146
  460. transformers/models/janus/modular_janus.py +62 -20
  461. transformers/models/jetmoe/configuration_jetmoe.py +6 -4
  462. transformers/models/jetmoe/modeling_jetmoe.py +3 -3
  463. transformers/models/jetmoe/modular_jetmoe.py +3 -3
  464. transformers/models/kosmos2/configuration_kosmos2.py +10 -8
  465. transformers/models/kosmos2/modeling_kosmos2.py +56 -34
  466. transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
  467. transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
  468. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
  469. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
  470. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
  471. transformers/models/lasr/configuration_lasr.py +2 -4
  472. transformers/models/lasr/modeling_lasr.py +3 -3
  473. transformers/models/lasr/modular_lasr.py +3 -3
  474. transformers/models/layoutlm/configuration_layoutlm.py +14 -1
  475. transformers/models/layoutlm/modeling_layoutlm.py +3 -3
  476. transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
  477. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
  478. transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
  479. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
  480. transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
  481. transformers/models/led/configuration_led.py +7 -8
  482. transformers/models/levit/image_processing_levit_fast.py +4 -4
  483. transformers/models/lfm2/configuration_lfm2.py +5 -7
  484. transformers/models/lfm2/modeling_lfm2.py +4 -4
  485. transformers/models/lfm2/modular_lfm2.py +3 -3
  486. transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
  487. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
  488. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  489. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
  490. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
  491. transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
  492. transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
  493. transformers/models/lightglue/modeling_lightglue.py +3 -3
  494. transformers/models/lightglue/modular_lightglue.py +3 -3
  495. transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
  496. transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
  497. transformers/models/lilt/configuration_lilt.py +6 -1
  498. transformers/models/llama/configuration_llama.py +5 -7
  499. transformers/models/llama/modeling_llama.py +4 -4
  500. transformers/models/llama4/configuration_llama4.py +67 -47
  501. transformers/models/llama4/image_processing_llama4_fast.py +3 -3
  502. transformers/models/llama4/modeling_llama4.py +46 -44
  503. transformers/models/llava/configuration_llava.py +10 -0
  504. transformers/models/llava/image_processing_llava_fast.py +3 -3
  505. transformers/models/llava/modeling_llava.py +38 -65
  506. transformers/models/llava_next/configuration_llava_next.py +2 -1
  507. transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
  508. transformers/models/llava_next/modeling_llava_next.py +61 -60
  509. transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
  510. transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
  511. transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
  512. transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
  513. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
  514. transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
  515. transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
  516. transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
  517. transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
  518. transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
  519. transformers/models/longformer/configuration_longformer.py +4 -1
  520. transformers/models/longt5/configuration_longt5.py +9 -6
  521. transformers/models/longt5/modeling_longt5.py +2 -1
  522. transformers/models/luke/configuration_luke.py +8 -1
  523. transformers/models/lw_detr/configuration_lw_detr.py +19 -31
  524. transformers/models/lw_detr/modeling_lw_detr.py +43 -44
  525. transformers/models/lw_detr/modular_lw_detr.py +36 -38
  526. transformers/models/lxmert/configuration_lxmert.py +16 -0
  527. transformers/models/m2m_100/configuration_m2m_100.py +7 -8
  528. transformers/models/m2m_100/modeling_m2m_100.py +3 -3
  529. transformers/models/mamba/configuration_mamba.py +5 -2
  530. transformers/models/mamba/modeling_mamba.py +18 -26
  531. transformers/models/mamba2/configuration_mamba2.py +5 -7
  532. transformers/models/mamba2/modeling_mamba2.py +22 -33
  533. transformers/models/marian/configuration_marian.py +10 -4
  534. transformers/models/marian/modeling_marian.py +4 -4
  535. transformers/models/markuplm/configuration_markuplm.py +4 -6
  536. transformers/models/markuplm/modeling_markuplm.py +3 -3
  537. transformers/models/mask2former/configuration_mask2former.py +12 -47
  538. transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
  539. transformers/models/mask2former/modeling_mask2former.py +18 -12
  540. transformers/models/maskformer/configuration_maskformer.py +14 -45
  541. transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
  542. transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
  543. transformers/models/maskformer/modeling_maskformer.py +15 -9
  544. transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
  545. transformers/models/mbart/configuration_mbart.py +9 -4
  546. transformers/models/mbart/modeling_mbart.py +9 -6
  547. transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
  548. transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
  549. transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
  550. transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
  551. transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
  552. transformers/models/mgp_str/modeling_mgp_str.py +4 -2
  553. transformers/models/mimi/configuration_mimi.py +4 -0
  554. transformers/models/mimi/modeling_mimi.py +40 -36
  555. transformers/models/minimax/configuration_minimax.py +8 -11
  556. transformers/models/minimax/modeling_minimax.py +5 -5
  557. transformers/models/minimax/modular_minimax.py +9 -12
  558. transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
  559. transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
  560. transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
  561. transformers/models/ministral/configuration_ministral.py +5 -7
  562. transformers/models/ministral/modeling_ministral.py +4 -4
  563. transformers/models/ministral/modular_ministral.py +5 -8
  564. transformers/models/ministral3/configuration_ministral3.py +4 -4
  565. transformers/models/ministral3/modeling_ministral3.py +4 -4
  566. transformers/models/ministral3/modular_ministral3.py +3 -3
  567. transformers/models/mistral/configuration_mistral.py +5 -7
  568. transformers/models/mistral/modeling_mistral.py +4 -4
  569. transformers/models/mistral/modular_mistral.py +3 -3
  570. transformers/models/mistral3/configuration_mistral3.py +4 -0
  571. transformers/models/mistral3/modeling_mistral3.py +36 -40
  572. transformers/models/mistral3/modular_mistral3.py +31 -32
  573. transformers/models/mixtral/configuration_mixtral.py +8 -11
  574. transformers/models/mixtral/modeling_mixtral.py +4 -4
  575. transformers/models/mlcd/modeling_mlcd.py +7 -5
  576. transformers/models/mlcd/modular_mlcd.py +7 -5
  577. transformers/models/mllama/configuration_mllama.py +5 -7
  578. transformers/models/mllama/image_processing_mllama_fast.py +6 -5
  579. transformers/models/mllama/modeling_mllama.py +19 -19
  580. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
  581. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
  582. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
  583. transformers/models/mobilebert/configuration_mobilebert.py +4 -1
  584. transformers/models/mobilebert/modeling_mobilebert.py +3 -3
  585. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
  586. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
  587. transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
  588. transformers/models/mobilevit/modeling_mobilevit.py +4 -2
  589. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
  590. transformers/models/modernbert/configuration_modernbert.py +46 -21
  591. transformers/models/modernbert/modeling_modernbert.py +146 -899
  592. transformers/models/modernbert/modular_modernbert.py +185 -908
  593. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
  594. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
  595. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
  596. transformers/models/moonshine/configuration_moonshine.py +12 -7
  597. transformers/models/moonshine/modeling_moonshine.py +7 -7
  598. transformers/models/moonshine/modular_moonshine.py +19 -13
  599. transformers/models/moshi/configuration_moshi.py +28 -2
  600. transformers/models/moshi/modeling_moshi.py +4 -9
  601. transformers/models/mpnet/configuration_mpnet.py +6 -1
  602. transformers/models/mpt/configuration_mpt.py +16 -0
  603. transformers/models/mra/configuration_mra.py +8 -1
  604. transformers/models/mt5/configuration_mt5.py +9 -5
  605. transformers/models/mt5/modeling_mt5.py +5 -8
  606. transformers/models/musicgen/configuration_musicgen.py +12 -7
  607. transformers/models/musicgen/modeling_musicgen.py +6 -5
  608. transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
  609. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
  610. transformers/models/mvp/configuration_mvp.py +8 -4
  611. transformers/models/mvp/modeling_mvp.py +6 -4
  612. transformers/models/nanochat/configuration_nanochat.py +5 -7
  613. transformers/models/nanochat/modeling_nanochat.py +4 -4
  614. transformers/models/nanochat/modular_nanochat.py +4 -4
  615. transformers/models/nemotron/configuration_nemotron.py +5 -7
  616. transformers/models/nemotron/modeling_nemotron.py +4 -14
  617. transformers/models/nllb/tokenization_nllb.py +7 -5
  618. transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
  619. transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
  620. transformers/models/nougat/image_processing_nougat_fast.py +8 -8
  621. transformers/models/nystromformer/configuration_nystromformer.py +8 -1
  622. transformers/models/olmo/configuration_olmo.py +5 -7
  623. transformers/models/olmo/modeling_olmo.py +4 -4
  624. transformers/models/olmo/modular_olmo.py +3 -3
  625. transformers/models/olmo2/configuration_olmo2.py +9 -11
  626. transformers/models/olmo2/modeling_olmo2.py +4 -4
  627. transformers/models/olmo2/modular_olmo2.py +7 -7
  628. transformers/models/olmo3/configuration_olmo3.py +10 -11
  629. transformers/models/olmo3/modeling_olmo3.py +4 -4
  630. transformers/models/olmo3/modular_olmo3.py +13 -14
  631. transformers/models/olmoe/configuration_olmoe.py +5 -7
  632. transformers/models/olmoe/modeling_olmoe.py +4 -4
  633. transformers/models/olmoe/modular_olmoe.py +3 -3
  634. transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
  635. transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
  636. transformers/models/oneformer/configuration_oneformer.py +9 -46
  637. transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
  638. transformers/models/oneformer/modeling_oneformer.py +14 -9
  639. transformers/models/openai/configuration_openai.py +16 -0
  640. transformers/models/opt/configuration_opt.py +6 -6
  641. transformers/models/opt/modeling_opt.py +5 -5
  642. transformers/models/ovis2/configuration_ovis2.py +4 -0
  643. transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
  644. transformers/models/ovis2/modeling_ovis2.py +58 -99
  645. transformers/models/ovis2/modular_ovis2.py +52 -13
  646. transformers/models/owlv2/configuration_owlv2.py +4 -1
  647. transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
  648. transformers/models/owlv2/modeling_owlv2.py +40 -27
  649. transformers/models/owlv2/modular_owlv2.py +5 -5
  650. transformers/models/owlvit/configuration_owlvit.py +4 -1
  651. transformers/models/owlvit/modeling_owlvit.py +40 -27
  652. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
  653. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
  654. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
  655. transformers/models/paligemma/configuration_paligemma.py +4 -0
  656. transformers/models/paligemma/modeling_paligemma.py +30 -26
  657. transformers/models/parakeet/configuration_parakeet.py +2 -4
  658. transformers/models/parakeet/modeling_parakeet.py +3 -3
  659. transformers/models/parakeet/modular_parakeet.py +3 -3
  660. transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
  661. transformers/models/patchtst/modeling_patchtst.py +3 -3
  662. transformers/models/pe_audio/modeling_pe_audio.py +4 -4
  663. transformers/models/pe_audio/modular_pe_audio.py +1 -1
  664. transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
  665. transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
  666. transformers/models/pe_video/modeling_pe_video.py +36 -24
  667. transformers/models/pe_video/modular_pe_video.py +36 -23
  668. transformers/models/pegasus/configuration_pegasus.py +8 -5
  669. transformers/models/pegasus/modeling_pegasus.py +4 -4
  670. transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
  671. transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
  672. transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
  673. transformers/models/perceiver/modeling_perceiver.py +17 -9
  674. transformers/models/perception_lm/modeling_perception_lm.py +26 -27
  675. transformers/models/perception_lm/modular_perception_lm.py +27 -25
  676. transformers/models/persimmon/configuration_persimmon.py +5 -7
  677. transformers/models/persimmon/modeling_persimmon.py +5 -5
  678. transformers/models/phi/configuration_phi.py +8 -6
  679. transformers/models/phi/modeling_phi.py +4 -4
  680. transformers/models/phi/modular_phi.py +3 -3
  681. transformers/models/phi3/configuration_phi3.py +9 -11
  682. transformers/models/phi3/modeling_phi3.py +4 -4
  683. transformers/models/phi3/modular_phi3.py +3 -3
  684. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
  685. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
  686. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
  687. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
  688. transformers/models/phimoe/configuration_phimoe.py +5 -7
  689. transformers/models/phimoe/modeling_phimoe.py +15 -39
  690. transformers/models/phimoe/modular_phimoe.py +12 -7
  691. transformers/models/pix2struct/configuration_pix2struct.py +12 -9
  692. transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
  693. transformers/models/pix2struct/modeling_pix2struct.py +14 -7
  694. transformers/models/pixio/configuration_pixio.py +2 -4
  695. transformers/models/pixio/modeling_pixio.py +9 -8
  696. transformers/models/pixio/modular_pixio.py +4 -2
  697. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
  698. transformers/models/pixtral/modeling_pixtral.py +9 -12
  699. transformers/models/plbart/configuration_plbart.py +8 -5
  700. transformers/models/plbart/modeling_plbart.py +9 -7
  701. transformers/models/plbart/modular_plbart.py +1 -1
  702. transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
  703. transformers/models/pop2piano/configuration_pop2piano.py +7 -6
  704. transformers/models/pop2piano/modeling_pop2piano.py +2 -1
  705. transformers/models/pp_doclayout_v3/__init__.py +30 -0
  706. transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
  707. transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
  708. transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
  709. transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
  710. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
  711. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
  712. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
  713. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
  714. transformers/models/prophetnet/configuration_prophetnet.py +11 -10
  715. transformers/models/prophetnet/modeling_prophetnet.py +12 -23
  716. transformers/models/pvt/image_processing_pvt.py +7 -7
  717. transformers/models/pvt/image_processing_pvt_fast.py +1 -1
  718. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  719. transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
  720. transformers/models/qwen2/configuration_qwen2.py +14 -4
  721. transformers/models/qwen2/modeling_qwen2.py +4 -4
  722. transformers/models/qwen2/modular_qwen2.py +3 -3
  723. transformers/models/qwen2/tokenization_qwen2.py +0 -4
  724. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
  725. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
  726. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
  727. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
  728. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
  729. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
  730. transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
  731. transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
  732. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  733. transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
  734. transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
  735. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
  736. transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
  737. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
  738. transformers/models/qwen3/configuration_qwen3.py +15 -5
  739. transformers/models/qwen3/modeling_qwen3.py +4 -4
  740. transformers/models/qwen3/modular_qwen3.py +3 -3
  741. transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
  742. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  743. transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
  744. transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
  745. transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
  746. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
  747. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
  748. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
  749. transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
  750. transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
  751. transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
  752. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
  753. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
  754. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
  755. transformers/models/rag/configuration_rag.py +6 -6
  756. transformers/models/rag/modeling_rag.py +3 -3
  757. transformers/models/rag/retrieval_rag.py +1 -1
  758. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
  759. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
  760. transformers/models/reformer/configuration_reformer.py +7 -7
  761. transformers/models/rembert/configuration_rembert.py +8 -1
  762. transformers/models/rembert/modeling_rembert.py +0 -22
  763. transformers/models/resnet/configuration_resnet.py +2 -4
  764. transformers/models/resnet/modeling_resnet.py +6 -5
  765. transformers/models/roberta/configuration_roberta.py +11 -2
  766. transformers/models/roberta/modeling_roberta.py +6 -6
  767. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
  768. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
  769. transformers/models/roc_bert/configuration_roc_bert.py +8 -1
  770. transformers/models/roc_bert/modeling_roc_bert.py +6 -41
  771. transformers/models/roformer/configuration_roformer.py +13 -2
  772. transformers/models/roformer/modeling_roformer.py +0 -14
  773. transformers/models/rt_detr/configuration_rt_detr.py +8 -49
  774. transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
  775. transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
  776. transformers/models/rt_detr/modeling_rt_detr.py +578 -737
  777. transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
  778. transformers/models/rt_detr/modular_rt_detr.py +1508 -6
  779. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
  780. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
  781. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
  782. transformers/models/rwkv/configuration_rwkv.py +2 -3
  783. transformers/models/rwkv/modeling_rwkv.py +0 -23
  784. transformers/models/sam/configuration_sam.py +2 -0
  785. transformers/models/sam/image_processing_sam_fast.py +4 -4
  786. transformers/models/sam/modeling_sam.py +13 -8
  787. transformers/models/sam/processing_sam.py +3 -3
  788. transformers/models/sam2/configuration_sam2.py +1 -1
  789. transformers/models/sam2/modeling_sam2.py +56 -52
  790. transformers/models/sam2/modular_sam2.py +47 -55
  791. transformers/models/sam2_video/modeling_sam2_video.py +50 -51
  792. transformers/models/sam2_video/modular_sam2_video.py +12 -10
  793. transformers/models/sam3/modeling_sam3.py +43 -47
  794. transformers/models/sam3/processing_sam3.py +8 -4
  795. transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
  796. transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
  797. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
  798. transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
  799. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
  800. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
  801. transformers/models/sam3_video/modeling_sam3_video.py +27 -14
  802. transformers/models/sam_hq/configuration_sam_hq.py +2 -0
  803. transformers/models/sam_hq/modeling_sam_hq.py +13 -9
  804. transformers/models/sam_hq/modular_sam_hq.py +6 -6
  805. transformers/models/sam_hq/processing_sam_hq.py +7 -6
  806. transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
  807. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
  808. transformers/models/seed_oss/configuration_seed_oss.py +7 -9
  809. transformers/models/seed_oss/modeling_seed_oss.py +4 -4
  810. transformers/models/seed_oss/modular_seed_oss.py +3 -3
  811. transformers/models/segformer/image_processing_segformer_fast.py +4 -4
  812. transformers/models/segformer/modeling_segformer.py +4 -2
  813. transformers/models/segformer/modular_segformer.py +3 -3
  814. transformers/models/seggpt/modeling_seggpt.py +20 -8
  815. transformers/models/sew/configuration_sew.py +4 -1
  816. transformers/models/sew/modeling_sew.py +9 -5
  817. transformers/models/sew/modular_sew.py +2 -1
  818. transformers/models/sew_d/configuration_sew_d.py +4 -1
  819. transformers/models/sew_d/modeling_sew_d.py +4 -1
  820. transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
  821. transformers/models/siglip/configuration_siglip.py +4 -1
  822. transformers/models/siglip/modeling_siglip.py +27 -71
  823. transformers/models/siglip2/__init__.py +1 -0
  824. transformers/models/siglip2/configuration_siglip2.py +4 -2
  825. transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
  826. transformers/models/siglip2/modeling_siglip2.py +37 -78
  827. transformers/models/siglip2/modular_siglip2.py +74 -25
  828. transformers/models/siglip2/tokenization_siglip2.py +95 -0
  829. transformers/models/smollm3/configuration_smollm3.py +6 -6
  830. transformers/models/smollm3/modeling_smollm3.py +4 -4
  831. transformers/models/smollm3/modular_smollm3.py +9 -9
  832. transformers/models/smolvlm/configuration_smolvlm.py +1 -3
  833. transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
  834. transformers/models/smolvlm/modeling_smolvlm.py +75 -46
  835. transformers/models/smolvlm/modular_smolvlm.py +36 -23
  836. transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
  837. transformers/models/solar_open/__init__.py +27 -0
  838. transformers/models/solar_open/configuration_solar_open.py +184 -0
  839. transformers/models/solar_open/modeling_solar_open.py +642 -0
  840. transformers/models/solar_open/modular_solar_open.py +224 -0
  841. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
  842. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
  843. transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
  844. transformers/models/speecht5/configuration_speecht5.py +7 -8
  845. transformers/models/splinter/configuration_splinter.py +6 -6
  846. transformers/models/splinter/modeling_splinter.py +8 -3
  847. transformers/models/squeezebert/configuration_squeezebert.py +14 -1
  848. transformers/models/stablelm/configuration_stablelm.py +8 -6
  849. transformers/models/stablelm/modeling_stablelm.py +5 -5
  850. transformers/models/starcoder2/configuration_starcoder2.py +11 -5
  851. transformers/models/starcoder2/modeling_starcoder2.py +5 -5
  852. transformers/models/starcoder2/modular_starcoder2.py +4 -4
  853. transformers/models/superglue/configuration_superglue.py +4 -0
  854. transformers/models/superglue/image_processing_superglue_fast.py +4 -3
  855. transformers/models/superglue/modeling_superglue.py +9 -4
  856. transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
  857. transformers/models/superpoint/modeling_superpoint.py +4 -2
  858. transformers/models/swin/configuration_swin.py +2 -4
  859. transformers/models/swin/modeling_swin.py +11 -8
  860. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
  861. transformers/models/swin2sr/modeling_swin2sr.py +4 -2
  862. transformers/models/swinv2/configuration_swinv2.py +2 -4
  863. transformers/models/swinv2/modeling_swinv2.py +10 -7
  864. transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
  865. transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
  866. transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
  867. transformers/models/t5/configuration_t5.py +9 -8
  868. transformers/models/t5/modeling_t5.py +5 -8
  869. transformers/models/t5gemma/configuration_t5gemma.py +10 -25
  870. transformers/models/t5gemma/modeling_t5gemma.py +9 -9
  871. transformers/models/t5gemma/modular_t5gemma.py +11 -24
  872. transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
  873. transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
  874. transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
  875. transformers/models/table_transformer/configuration_table_transformer.py +18 -49
  876. transformers/models/table_transformer/modeling_table_transformer.py +27 -53
  877. transformers/models/tapas/configuration_tapas.py +12 -1
  878. transformers/models/tapas/modeling_tapas.py +1 -1
  879. transformers/models/tapas/tokenization_tapas.py +1 -0
  880. transformers/models/textnet/configuration_textnet.py +4 -6
  881. transformers/models/textnet/image_processing_textnet_fast.py +3 -3
  882. transformers/models/textnet/modeling_textnet.py +15 -14
  883. transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
  884. transformers/models/timesfm/modeling_timesfm.py +5 -6
  885. transformers/models/timesfm/modular_timesfm.py +5 -6
  886. transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
  887. transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
  888. transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
  889. transformers/models/trocr/configuration_trocr.py +11 -7
  890. transformers/models/trocr/modeling_trocr.py +4 -2
  891. transformers/models/tvp/configuration_tvp.py +10 -35
  892. transformers/models/tvp/image_processing_tvp_fast.py +6 -5
  893. transformers/models/tvp/modeling_tvp.py +1 -1
  894. transformers/models/udop/configuration_udop.py +16 -7
  895. transformers/models/udop/modeling_udop.py +10 -6
  896. transformers/models/umt5/configuration_umt5.py +8 -6
  897. transformers/models/umt5/modeling_umt5.py +7 -3
  898. transformers/models/unispeech/configuration_unispeech.py +4 -1
  899. transformers/models/unispeech/modeling_unispeech.py +7 -4
  900. transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
  901. transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
  902. transformers/models/upernet/configuration_upernet.py +8 -35
  903. transformers/models/upernet/modeling_upernet.py +1 -1
  904. transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
  905. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  906. transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
  907. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
  908. transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
  909. transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
  910. transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
  911. transformers/models/video_llava/configuration_video_llava.py +4 -0
  912. transformers/models/video_llava/modeling_video_llava.py +87 -89
  913. transformers/models/videomae/modeling_videomae.py +4 -5
  914. transformers/models/vilt/configuration_vilt.py +4 -1
  915. transformers/models/vilt/image_processing_vilt_fast.py +6 -6
  916. transformers/models/vilt/modeling_vilt.py +27 -12
  917. transformers/models/vipllava/configuration_vipllava.py +4 -0
  918. transformers/models/vipllava/modeling_vipllava.py +57 -31
  919. transformers/models/vipllava/modular_vipllava.py +50 -24
  920. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
  921. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
  922. transformers/models/visual_bert/configuration_visual_bert.py +6 -1
  923. transformers/models/vit/configuration_vit.py +2 -2
  924. transformers/models/vit/modeling_vit.py +7 -5
  925. transformers/models/vit_mae/modeling_vit_mae.py +11 -7
  926. transformers/models/vit_msn/modeling_vit_msn.py +11 -7
  927. transformers/models/vitdet/configuration_vitdet.py +2 -4
  928. transformers/models/vitdet/modeling_vitdet.py +2 -3
  929. transformers/models/vitmatte/configuration_vitmatte.py +6 -35
  930. transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
  931. transformers/models/vitmatte/modeling_vitmatte.py +1 -1
  932. transformers/models/vitpose/configuration_vitpose.py +6 -43
  933. transformers/models/vitpose/modeling_vitpose.py +5 -3
  934. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
  935. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
  936. transformers/models/vits/configuration_vits.py +4 -0
  937. transformers/models/vits/modeling_vits.py +9 -7
  938. transformers/models/vivit/modeling_vivit.py +4 -4
  939. transformers/models/vjepa2/modeling_vjepa2.py +9 -9
  940. transformers/models/voxtral/configuration_voxtral.py +0 -1
  941. transformers/models/voxtral/modeling_voxtral.py +25 -24
  942. transformers/models/voxtral/modular_voxtral.py +26 -20
  943. transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
  944. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
  945. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
  946. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
  947. transformers/models/wavlm/configuration_wavlm.py +4 -1
  948. transformers/models/wavlm/modeling_wavlm.py +4 -1
  949. transformers/models/whisper/configuration_whisper.py +6 -4
  950. transformers/models/whisper/generation_whisper.py +0 -1
  951. transformers/models/whisper/modeling_whisper.py +3 -3
  952. transformers/models/x_clip/configuration_x_clip.py +4 -1
  953. transformers/models/x_clip/modeling_x_clip.py +26 -27
  954. transformers/models/xglm/configuration_xglm.py +9 -7
  955. transformers/models/xlm/configuration_xlm.py +10 -7
  956. transformers/models/xlm/modeling_xlm.py +1 -1
  957. transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
  958. transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
  959. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
  960. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
  961. transformers/models/xlnet/configuration_xlnet.py +3 -1
  962. transformers/models/xlstm/configuration_xlstm.py +5 -7
  963. transformers/models/xlstm/modeling_xlstm.py +0 -32
  964. transformers/models/xmod/configuration_xmod.py +11 -2
  965. transformers/models/xmod/modeling_xmod.py +13 -16
  966. transformers/models/yolos/image_processing_yolos_fast.py +25 -28
  967. transformers/models/yolos/modeling_yolos.py +7 -7
  968. transformers/models/yolos/modular_yolos.py +16 -16
  969. transformers/models/yoso/configuration_yoso.py +8 -1
  970. transformers/models/youtu/__init__.py +27 -0
  971. transformers/models/youtu/configuration_youtu.py +194 -0
  972. transformers/models/youtu/modeling_youtu.py +619 -0
  973. transformers/models/youtu/modular_youtu.py +254 -0
  974. transformers/models/zamba/configuration_zamba.py +5 -7
  975. transformers/models/zamba/modeling_zamba.py +25 -56
  976. transformers/models/zamba2/configuration_zamba2.py +8 -13
  977. transformers/models/zamba2/modeling_zamba2.py +53 -78
  978. transformers/models/zamba2/modular_zamba2.py +36 -29
  979. transformers/models/zoedepth/configuration_zoedepth.py +17 -40
  980. transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
  981. transformers/models/zoedepth/modeling_zoedepth.py +5 -3
  982. transformers/pipelines/__init__.py +1 -61
  983. transformers/pipelines/any_to_any.py +1 -1
  984. transformers/pipelines/automatic_speech_recognition.py +0 -2
  985. transformers/pipelines/base.py +1 -1
  986. transformers/pipelines/image_text_to_text.py +1 -1
  987. transformers/pipelines/text_to_audio.py +5 -1
  988. transformers/processing_utils.py +35 -44
  989. transformers/pytorch_utils.py +2 -26
  990. transformers/quantizers/quantizer_compressed_tensors.py +7 -5
  991. transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
  992. transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
  993. transformers/quantizers/quantizer_mxfp4.py +1 -1
  994. transformers/quantizers/quantizer_torchao.py +0 -16
  995. transformers/safetensors_conversion.py +11 -4
  996. transformers/testing_utils.py +3 -28
  997. transformers/tokenization_mistral_common.py +9 -0
  998. transformers/tokenization_python.py +6 -4
  999. transformers/tokenization_utils_base.py +119 -219
  1000. transformers/tokenization_utils_tokenizers.py +31 -2
  1001. transformers/trainer.py +25 -33
  1002. transformers/trainer_seq2seq.py +1 -1
  1003. transformers/training_args.py +411 -417
  1004. transformers/utils/__init__.py +1 -4
  1005. transformers/utils/auto_docstring.py +15 -18
  1006. transformers/utils/backbone_utils.py +13 -373
  1007. transformers/utils/doc.py +4 -36
  1008. transformers/utils/generic.py +69 -33
  1009. transformers/utils/import_utils.py +72 -75
  1010. transformers/utils/loading_report.py +133 -105
  1011. transformers/utils/quantization_config.py +0 -21
  1012. transformers/video_processing_utils.py +5 -5
  1013. transformers/video_utils.py +3 -1
  1014. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
  1015. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
  1016. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
  1017. transformers/pipelines/deprecated/text2text_generation.py +0 -408
  1018. transformers/pipelines/image_to_text.py +0 -189
  1019. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
  1020. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
  1021. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -36,7 +36,7 @@ from ...modeling_rope_utils import (
36
36
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
37
37
  from ...processing_utils import Unpack
38
38
  from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
39
- from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
39
+ from ...utils.generic import OutputRecorder, check_model_inputs, is_flash_attention_requested, maybe_autocast
40
40
  from .configuration_mllama import MllamaConfig, MllamaTextConfig, MllamaVisionConfig
41
41
 
42
42
 
@@ -252,10 +252,9 @@ class MllamaVisionAttention(nn.Module):
252
252
  key = key.view(batch_size, kv_seq_len, self.num_heads, self.head_dim).transpose(1, 2)
253
253
  value = value.view(batch_size, kv_seq_len, self.num_heads, self.head_dim).transpose(1, 2)
254
254
 
255
- attention_interface: Callable = eager_attention_forward
256
-
257
- if self.config._attn_implementation != "eager":
258
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
255
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
256
+ self.config._attn_implementation, eager_attention_forward
257
+ )
259
258
 
260
259
  attn_output, attn_weights = attention_interface(
261
260
  self,
@@ -451,10 +450,9 @@ class MllamaTextCrossAttention(nn.Module):
451
450
  "Cross attention layer can't find neither `cross_attn_states` nor cached values for key/values!"
452
451
  )
453
452
 
454
- attention_interface: Callable = eager_attention_forward
455
-
456
- if self.config._attn_implementation != "eager":
457
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
453
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
454
+ self.config._attn_implementation, eager_attention_forward
455
+ )
458
456
 
459
457
  attn_output, attn_weights = attention_interface(
460
458
  self,
@@ -554,10 +552,9 @@ class MllamaTextSelfAttention(nn.Module):
554
552
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
555
553
  key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
556
554
 
557
- attention_interface: Callable = eager_attention_forward
558
-
559
- if self.config._attn_implementation != "eager":
560
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
555
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
556
+ self.config._attn_implementation, eager_attention_forward
557
+ )
561
558
 
562
559
  attn_output, attn_weights = attention_interface(
563
560
  self,
@@ -863,7 +860,7 @@ class MllamaPreTrainedModel(PreTrainedModel):
863
860
  past_key_values: Cache,
864
861
  output_attentions: bool = False,
865
862
  ):
866
- if self.config._attn_implementation == "flash_attention_2":
863
+ if is_flash_attention_requested(self.config):
867
864
  if attention_mask is not None and (attention_mask == 0.0).any():
868
865
  return attention_mask
869
866
  return None
@@ -1067,7 +1064,8 @@ class MllamaVisionModel(MllamaPreTrainedModel):
1067
1064
 
1068
1065
  ```python
1069
1066
  >>> from PIL import Image
1070
- >>> import requests
1067
+ >>> import httpx
1068
+ >>> from io import BytesIO
1071
1069
  >>> from transformers import AutoProcessor, MllamaVisionModel
1072
1070
 
1073
1071
  >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
@@ -1075,7 +1073,8 @@ class MllamaVisionModel(MllamaPreTrainedModel):
1075
1073
  >>> processor = AutoProcessor.from_pretrained(checkpoint)
1076
1074
 
1077
1075
  >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
1078
- >>> image = Image.open(requests.get(url, stream=True).raw)
1076
+ >>> with httpx.stream("GET", url) as response:
1077
+ ... image = Image.open(BytesIO(response.read()))
1079
1078
  >>> inputs = processor(images=image, return_tensors="pt")
1080
1079
 
1081
1080
  >>> output = model(**inputs)
@@ -1454,7 +1453,6 @@ class MllamaModel(MllamaPreTrainedModel):
1454
1453
  self.hidden_size = config.text_config.hidden_size
1455
1454
  self.max_num_tiles = config.vision_config.max_num_tiles
1456
1455
  self.vision_output_dim = config.vision_config.vision_output_dim
1457
- self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
1458
1456
 
1459
1457
  self.vision_model = MllamaVisionModel._from_config(config.vision_config)
1460
1458
  self.language_model = MllamaTextModel._from_config(config.text_config)
@@ -1657,7 +1655,8 @@ class MllamaForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin):
1657
1655
 
1658
1656
  ```python
1659
1657
  >>> from PIL import Image
1660
- >>> import requests
1658
+ >>> import httpx
1659
+ >>> from io import BytesIO
1661
1660
  >>> from transformers import AutoProcessor, MllamaForConditionalGeneration
1662
1661
 
1663
1662
  >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
@@ -1666,7 +1665,8 @@ class MllamaForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin):
1666
1665
 
1667
1666
  >>> prompt = "<|image|>If I had to write a haiku for this one"
1668
1667
  >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
1669
- >>> image = Image.open(requests.get(url, stream=True).raw)
1668
+ >>> with httpx.stream("GET", url) as response:
1669
+ ... image = Image.open(BytesIO(response.read()))
1670
1670
 
1671
1671
  >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
1672
1672
 
@@ -17,9 +17,9 @@
17
17
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
18
  # See the License for the specific language governing permissions and
19
19
  # limitations under the License.
20
+ from ...backbone_utils import consolidate_backbone_kwargs_to_config
20
21
  from ...configuration_utils import PreTrainedConfig
21
22
  from ...utils import logging
22
- from ...utils.backbone_utils import verify_backbone_config_arguments
23
23
  from ..auto import CONFIG_MAPPING, AutoConfig
24
24
 
25
25
 
@@ -39,18 +39,6 @@ class MMGroundingDinoConfig(PreTrainedConfig):
39
39
  Args:
40
40
  backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `SwinConfig()`):
41
41
  The configuration of the backbone model.
42
- backbone (`str`, *optional*):
43
- Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
44
- will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
45
- is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
46
- use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
47
- Whether to use pretrained weights for the backbone.
48
- use_timm_backbone (`bool`, *optional*, defaults to `False`):
49
- Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
50
- library.
51
- backbone_kwargs (`dict`, *optional*):
52
- Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
53
- e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
54
42
  text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `BertConfig`):
55
43
  The config object or dictionary of the text backbone.
56
44
  num_queries (`int`, *optional*, defaults to 900):
@@ -127,6 +115,8 @@ class MMGroundingDinoConfig(PreTrainedConfig):
127
115
  The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
128
116
  layer_norm_eps (`float`, *optional*, defaults to 1e-05):
129
117
  The epsilon used by the layer normalization layers.
118
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`):
119
+ Whether to tie weight embeddings
130
120
 
131
121
  Examples:
132
122
 
@@ -153,10 +143,6 @@ class MMGroundingDinoConfig(PreTrainedConfig):
153
143
  def __init__(
154
144
  self,
155
145
  backbone_config=None,
156
- backbone=None,
157
- use_pretrained_backbone=False,
158
- use_timm_backbone=False,
159
- backbone_kwargs=None,
160
146
  text_config=None,
161
147
  num_queries=900,
162
148
  encoder_layers=6,
@@ -194,40 +180,17 @@ class MMGroundingDinoConfig(PreTrainedConfig):
194
180
  positional_embedding_temperature=20,
195
181
  init_std=0.02,
196
182
  layer_norm_eps=1e-5,
183
+ tie_word_embeddings=True,
197
184
  **kwargs,
198
185
  ):
199
- if backbone_config is None and backbone is None:
200
- logger.info("`backbone_config` is `None`. Initializing the config with the default `Swin` backbone.")
201
- backbone_config = CONFIG_MAPPING["swin"](
202
- window_size=7,
203
- image_size=224,
204
- embed_dim=96,
205
- depths=[2, 2, 6, 2],
206
- num_heads=[3, 6, 12, 24],
207
- out_indices=[2, 3, 4],
208
- )
209
- elif isinstance(backbone_config, dict):
210
- backbone_model_type = backbone_config.pop("model_type")
211
- config_class = CONFIG_MAPPING[backbone_model_type]
212
- backbone_config = config_class.from_dict(backbone_config)
213
-
214
- verify_backbone_config_arguments(
215
- use_timm_backbone=use_timm_backbone,
216
- use_pretrained_backbone=use_pretrained_backbone,
217
- backbone=backbone,
186
+ backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
218
187
  backbone_config=backbone_config,
219
- backbone_kwargs=backbone_kwargs,
188
+ default_config_type="swin",
189
+ default_config_kwargs={"out_indices": [2, 3, 4]},
190
+ **kwargs,
220
191
  )
221
192
 
222
- if text_config is None:
223
- text_config = {}
224
- logger.info("text_config is None. Initializing the text config with default values (`BertConfig`).")
225
-
226
193
  self.backbone_config = backbone_config
227
- self.backbone = backbone
228
- self.use_pretrained_backbone = use_pretrained_backbone
229
- self.use_timm_backbone = use_timm_backbone
230
- self.backbone_kwargs = backbone_kwargs
231
194
  self.num_queries = num_queries
232
195
  self.d_model = d_model
233
196
  self.encoder_ffn_dim = encoder_ffn_dim
@@ -261,6 +224,7 @@ class MMGroundingDinoConfig(PreTrainedConfig):
261
224
  text_config["model_type"] = text_config.get("model_type", "bert")
262
225
  text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
263
226
  elif text_config is None:
227
+ logger.info("text_config is None. Initializing the text config with default values (`BertConfig`).")
264
228
  text_config = CONFIG_MAPPING["bert"]()
265
229
 
266
230
  self.text_config = text_config
@@ -277,6 +241,7 @@ class MMGroundingDinoConfig(PreTrainedConfig):
277
241
  self.positional_embedding_temperature = positional_embedding_temperature
278
242
  self.init_std = init_std
279
243
  self.layer_norm_eps = layer_norm_eps
244
+ self.tie_word_embeddings = tie_word_embeddings
280
245
 
281
246
  super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
282
247
 
@@ -27,20 +27,16 @@ from torch import Tensor, nn
27
27
 
28
28
  from ... import initialization as init
29
29
  from ...activations import ACT2FN
30
- from ...file_utils import ModelOutput, is_timm_available, requires_backends
30
+ from ...backbone_utils import load_backbone
31
+ from ...file_utils import ModelOutput
31
32
  from ...integrations import use_kernel_forward_from_hub
32
33
  from ...modeling_utils import PreTrainedModel
33
34
  from ...pytorch_utils import meshgrid
34
- from ...utils import auto_docstring
35
- from ...utils.backbone_utils import load_backbone
35
+ from ...utils import auto_docstring, torch_compilable_check
36
36
  from ..auto.modeling_auto import AutoModel
37
37
  from .configuration_mm_grounding_dino import MMGroundingDinoConfig
38
38
 
39
39
 
40
- if is_timm_available():
41
- from timm import create_model
42
-
43
-
44
40
  class MMGroundingDinoContrastiveEmbedding(nn.Module):
45
41
  def __init__(self, config):
46
42
  super().__init__()
@@ -182,9 +178,6 @@ class MMGroundingDinoMultiscaleDeformableAttention(nn.Module):
182
178
 
183
179
  self.disable_custom_kernels = config.disable_custom_kernels
184
180
 
185
- def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Tensor | None):
186
- return tensor if position_embeddings is None else tensor + position_embeddings
187
-
188
181
  def forward(
189
182
  self,
190
183
  hidden_states: torch.Tensor,
@@ -200,15 +193,15 @@ class MMGroundingDinoMultiscaleDeformableAttention(nn.Module):
200
193
  ):
201
194
  # add position embeddings to the hidden states before projecting to queries and keys
202
195
  if position_embeddings is not None:
203
- hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
196
+ hidden_states = hidden_states + position_embeddings
204
197
 
205
198
  batch_size, num_queries, _ = hidden_states.shape
206
199
  batch_size, sequence_length, _ = encoder_hidden_states.shape
207
200
  # Ignore copy
208
- if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
209
- raise ValueError(
210
- "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
211
- )
201
+ torch_compilable_check(
202
+ (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == sequence_length,
203
+ "Make sure to align the spatial shapes with the sequence length of the encoder hidden states",
204
+ )
212
205
 
213
206
  value = self.value_proj(encoder_hidden_states)
214
207
  if attention_mask is not None:
@@ -654,46 +647,23 @@ class MMGroundingDinoConvEncoder(nn.Module):
654
647
  super().__init__()
655
648
 
656
649
  self.config = config
657
-
658
- if config.use_timm_backbone:
659
- requires_backends(self, ["timm"])
660
- backbone = create_model(
661
- config.backbone,
662
- pretrained=config.use_pretrained_backbone,
663
- features_only=True,
664
- **config.backbone_kwargs,
665
- )
666
- else:
667
- backbone = load_backbone(config)
650
+ backbone = load_backbone(config)
668
651
 
669
652
  # replace batch norm by frozen batch norm
670
653
  with torch.no_grad():
671
654
  replace_batch_norm(backbone)
672
655
  self.model = backbone
673
- self.intermediate_channel_sizes = (
674
- self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
675
- )
676
-
677
- backbone_model_type = None
678
- if config.backbone is not None:
679
- backbone_model_type = config.backbone
680
- elif config.backbone_config is not None:
681
- backbone_model_type = config.backbone_config.model_type
682
- else:
683
- raise ValueError("Either `backbone` or `backbone_config` should be provided in the config")
656
+ self.intermediate_channel_sizes = self.model.channels
684
657
 
658
+ backbone_model_type = config.backbone_config.model_type
685
659
  if "resnet" in backbone_model_type:
686
660
  for name, parameter in self.model.named_parameters():
687
- if config.use_timm_backbone:
688
- if "layer2" not in name and "layer3" not in name and "layer4" not in name:
689
- parameter.requires_grad_(False)
690
- else:
691
- if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
692
- parameter.requires_grad_(False)
661
+ if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
662
+ parameter.requires_grad_(False)
693
663
 
694
664
  def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
695
665
  # send pixel_values through the model to get list of feature maps
696
- features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps
666
+ features = self.model(pixel_values, return_dict=True).feature_maps
697
667
 
698
668
  out = []
699
669
  for feature_map in features:
@@ -703,6 +673,7 @@ class MMGroundingDinoConvEncoder(nn.Module):
703
673
  return out
704
674
 
705
675
 
676
+ # TODO: use modular - Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->MMGroundingDino
706
677
  class MMGroundingDinoConvModel(nn.Module):
707
678
  """
708
679
  This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
@@ -1131,12 +1102,12 @@ class MMGroundingDinoEncoder(MMGroundingDinoPreTrainedModel):
1131
1102
  self.post_init()
1132
1103
 
1133
1104
  @staticmethod
1134
- def get_reference_points(spatial_shapes, valid_ratios, device):
1105
+ def get_reference_points(spatial_shapes_list, valid_ratios, device):
1135
1106
  """
1136
1107
  Get reference points for each feature map.
1137
1108
 
1138
1109
  Args:
1139
- spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
1110
+ spatial_shapes_list (`list[tuple[int, int]]`):
1140
1111
  Spatial shapes of each feature map.
1141
1112
  valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
1142
1113
  Valid ratios of each feature map.
@@ -1146,7 +1117,7 @@ class MMGroundingDinoEncoder(MMGroundingDinoPreTrainedModel):
1146
1117
  `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
1147
1118
  """
1148
1119
  reference_points_list = []
1149
- for level, (height, width) in enumerate(spatial_shapes):
1120
+ for level, (height, width) in enumerate(spatial_shapes_list):
1150
1121
  ref_y, ref_x = meshgrid(
1151
1122
  torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device),
1152
1123
  torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device),
@@ -1229,7 +1200,7 @@ class MMGroundingDinoEncoder(MMGroundingDinoPreTrainedModel):
1229
1200
  )
1230
1201
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1231
1202
 
1232
- reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=vision_features.device)
1203
+ reference_points = self.get_reference_points(spatial_shapes_list, valid_ratios, device=vision_features.device)
1233
1204
 
1234
1205
  encoder_vision_states = () if output_hidden_states else None
1235
1206
  encoder_text_states = () if output_hidden_states else None
@@ -1783,33 +1754,42 @@ def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTen
1783
1754
  - **attention_mask** (`torch.BoolTensor` of shape `(batch_size, sequence_length, sequence_length)`)
1784
1755
  - **position_ids** (`torch.LongTensor` of shape `(batch_size, sequence_length)`)
1785
1756
  """
1786
- batch_size, num_token = input_ids.shape
1787
- # special_tokens_mask: batch_size, num_token. 1 for special tokens. 0 for normal tokens
1788
- special_tokens_mask = torch.zeros((batch_size, num_token), device=input_ids.device).bool()
1789
- for special_token in SPECIAL_TOKENS:
1790
- special_tokens_mask = torch.logical_or(special_tokens_mask, input_ids == special_token)
1791
-
1792
- # idxs: each row is a list of indices of special tokens
1793
- idxs = torch.nonzero(special_tokens_mask)
1794
-
1795
- # generate attention mask and positional ids
1796
- attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(batch_size, 1, 1)
1797
- position_ids = torch.zeros((batch_size, num_token), device=input_ids.device)
1798
- previous_col = 0
1799
- for i in range(idxs.shape[0]):
1800
- row, col = idxs[i]
1801
- if (col == 0) or (col == num_token - 1):
1802
- attention_mask[row, col, col] = True
1803
- position_ids[row, col] = 0
1804
- else:
1805
- attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
1806
- position_ids[row, previous_col + 1 : col + 1] = torch.arange(
1807
- 0, col - previous_col, device=input_ids.device
1808
- )
1757
+ batch_size, seq_len = input_ids.shape
1758
+ device = input_ids.device
1759
+
1760
+ # Identify special token positions
1761
+ special_mask = torch.isin(input_ids, torch.tensor(SPECIAL_TOKENS, device=device))
1762
+
1763
+ # For each position, find the previous and next special token indices
1764
+ indices = torch.arange(seq_len, device=device).unsqueeze(0).expand(batch_size, -1)
1809
1765
 
1810
- previous_col = col
1766
+ # Previous special token: cummax of special token indices
1767
+ prev_special = torch.where(special_mask, indices, torch.tensor(-1, device=device))
1768
+ prev_special = torch.cummax(prev_special, dim=1)[0]
1811
1769
 
1812
- return attention_mask, position_ids.to(torch.long)
1770
+ # Next special token: flip, cummin, flip back
1771
+ next_special = torch.where(special_mask, indices, torch.tensor(seq_len, device=device))
1772
+ next_special = torch.flip(torch.cummin(torch.flip(next_special, dims=[1]), dim=1)[0], dims=[1])
1773
+
1774
+ # Tokens with the same next_special belong to the same block
1775
+ # Exclude blocks whose closing delimiter is at position 0 or seq_len-1
1776
+ valid_block = (next_special != 0) & (next_special != seq_len - 1) & (next_special != seq_len)
1777
+
1778
+ # Build attention mask: tokens attend to each other if they share the same next_special
1779
+ next_i = next_special.unsqueeze(2) # (B, N, 1)
1780
+ next_j = next_special.unsqueeze(1) # (B, 1, N)
1781
+ attention_mask = (next_i == next_j) & valid_block.unsqueeze(1)
1782
+
1783
+ # Always allow self-attention
1784
+ identity = torch.eye(seq_len, device=device, dtype=torch.bool).unsqueeze(0).expand(batch_size, -1, -1)
1785
+ attention_mask = identity | attention_mask
1786
+
1787
+ # Position IDs: distance from previous special token
1788
+ position_ids = indices - prev_special - 1
1789
+ position_ids = torch.where(valid_block, position_ids, torch.zeros_like(position_ids))
1790
+ position_ids = torch.clamp(position_ids, min=0).to(torch.long)
1791
+
1792
+ return attention_mask, position_ids
1813
1793
 
1814
1794
 
1815
1795
  @auto_docstring(
@@ -1888,13 +1868,13 @@ class MMGroundingDinoModel(MMGroundingDinoPreTrainedModel):
1888
1868
  valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1)
1889
1869
  return valid_ratio
1890
1870
 
1891
- def generate_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
1871
+ def generate_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes_list):
1892
1872
  """Generate the encoder output proposals from encoded enc_output.
1893
1873
 
1894
1874
  Args:
1895
1875
  enc_output (`torch.Tensor[batch_size, sequence_length, hidden_size]`): Output of the encoder.
1896
1876
  padding_mask (`torch.Tensor[batch_size, sequence_length]`): Padding mask for `enc_output`.
1897
- spatial_shapes (`torch.Tensor[num_feature_levels, 2]`): Spatial shapes of the feature maps.
1877
+ spatial_shapes_list (`list[tuple[int, int]]`): Spatial shapes of each feature map.
1898
1878
 
1899
1879
  Returns:
1900
1880
  `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
@@ -1906,7 +1886,7 @@ class MMGroundingDinoModel(MMGroundingDinoPreTrainedModel):
1906
1886
  batch_size = enc_output.shape[0]
1907
1887
  proposals = []
1908
1888
  current_position = 0
1909
- for level, (height, width) in enumerate(spatial_shapes):
1889
+ for level, (height, width) in enumerate(spatial_shapes_list):
1910
1890
  mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)]
1911
1891
  mask_flatten_ = mask_flatten_.view(batch_size, height, width, 1)
1912
1892
  valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
@@ -1970,10 +1950,12 @@ class MMGroundingDinoModel(MMGroundingDinoPreTrainedModel):
1970
1950
  ```python
1971
1951
  >>> from transformers import AutoProcessor, AutoModel
1972
1952
  >>> from PIL import Image
1973
- >>> import requests
1953
+ >>> import httpx
1954
+ >>> from io import BytesIO
1974
1955
 
1975
1956
  >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1976
- >>> image = Image.open(requests.get(url, stream=True).raw)
1957
+ >>> with httpx.stream("GET", url) as response:
1958
+ ... image = Image.open(BytesIO(response.read()))
1977
1959
  >>> text = "a cat."
1978
1960
 
1979
1961
  >>> processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
@@ -2121,7 +2103,7 @@ class MMGroundingDinoModel(MMGroundingDinoPreTrainedModel):
2121
2103
  encoder_pred_boxes = None
2122
2104
  if self.config.two_stage:
2123
2105
  object_query_embedding, output_proposals = self.generate_encoder_output_proposals(
2124
- encoder_outputs[0], ~mask_flatten, spatial_shapes
2106
+ encoder_outputs[0], ~mask_flatten, spatial_shapes_list
2125
2107
  )
2126
2108
 
2127
2109
  # hack implementation as in two-stage Deformable DETR
@@ -2218,8 +2200,6 @@ class MMGroundingDinoMLPPredictionHead(nn.Module):
2218
2200
  Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
2219
2201
  height and width of a bounding box w.r.t. an image.
2220
2202
 
2221
- Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
2222
-
2223
2203
  """
2224
2204
 
2225
2205
  def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
@@ -2454,7 +2434,8 @@ class MMGroundingDinoForObjectDetection(MMGroundingDinoPreTrainedModel):
2454
2434
  Examples:
2455
2435
 
2456
2436
  ```python
2457
- >>> import requests
2437
+ >>> import httpx
2438
+ >>> from io import BytesIO
2458
2439
 
2459
2440
  >>> import torch
2460
2441
  >>> from PIL import Image
@@ -2466,8 +2447,9 @@ class MMGroundingDinoForObjectDetection(MMGroundingDinoPreTrainedModel):
2466
2447
  >>> processor = AutoProcessor.from_pretrained(model_id)
2467
2448
  >>> model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
2468
2449
 
2469
- >>> image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
2470
- >>> image = Image.open(requests.get(image_url, stream=True).raw)
2450
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
2451
+ >>> with httpx.stream("GET", url) as response:
2452
+ ... image = Image.open(BytesIO(response.read()))
2471
2453
  >>> # Check for cats and remote controls
2472
2454
  >>> text_labels = [["a cat", "a remote control"]]
2473
2455
 
@@ -17,9 +17,9 @@ import torch
17
17
  from torch import nn
18
18
 
19
19
  from ... import initialization as init
20
+ from ...backbone_utils import consolidate_backbone_kwargs_to_config
20
21
  from ...configuration_utils import PreTrainedConfig
21
22
  from ...utils import logging
22
- from ...utils.backbone_utils import verify_backbone_config_arguments
23
23
  from ..auto import CONFIG_MAPPING, AutoConfig
24
24
  from ..auto.modeling_auto import AutoModel
25
25
  from ..grounding_dino.modeling_grounding_dino import (
@@ -52,18 +52,6 @@ class MMGroundingDinoConfig(PreTrainedConfig):
52
52
  Args:
53
53
  backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `SwinConfig()`):
54
54
  The configuration of the backbone model.
55
- backbone (`str`, *optional*):
56
- Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
57
- will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
58
- is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
59
- use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
60
- Whether to use pretrained weights for the backbone.
61
- use_timm_backbone (`bool`, *optional*, defaults to `False`):
62
- Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
63
- library.
64
- backbone_kwargs (`dict`, *optional*):
65
- Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
66
- e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
67
55
  text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `BertConfig`):
68
56
  The config object or dictionary of the text backbone.
69
57
  num_queries (`int`, *optional*, defaults to 900):
@@ -140,6 +128,8 @@ class MMGroundingDinoConfig(PreTrainedConfig):
140
128
  The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
141
129
  layer_norm_eps (`float`, *optional*, defaults to 1e-05):
142
130
  The epsilon used by the layer normalization layers.
131
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`):
132
+ Whether to tie weight embeddings
143
133
 
144
134
  Examples:
145
135
 
@@ -166,10 +156,6 @@ class MMGroundingDinoConfig(PreTrainedConfig):
166
156
  def __init__(
167
157
  self,
168
158
  backbone_config=None,
169
- backbone=None,
170
- use_pretrained_backbone=False,
171
- use_timm_backbone=False,
172
- backbone_kwargs=None,
173
159
  text_config=None,
174
160
  num_queries=900,
175
161
  encoder_layers=6,
@@ -207,40 +193,17 @@ class MMGroundingDinoConfig(PreTrainedConfig):
207
193
  positional_embedding_temperature=20,
208
194
  init_std=0.02,
209
195
  layer_norm_eps=1e-5,
196
+ tie_word_embeddings=True,
210
197
  **kwargs,
211
198
  ):
212
- if backbone_config is None and backbone is None:
213
- logger.info("`backbone_config` is `None`. Initializing the config with the default `Swin` backbone.")
214
- backbone_config = CONFIG_MAPPING["swin"](
215
- window_size=7,
216
- image_size=224,
217
- embed_dim=96,
218
- depths=[2, 2, 6, 2],
219
- num_heads=[3, 6, 12, 24],
220
- out_indices=[2, 3, 4],
221
- )
222
- elif isinstance(backbone_config, dict):
223
- backbone_model_type = backbone_config.pop("model_type")
224
- config_class = CONFIG_MAPPING[backbone_model_type]
225
- backbone_config = config_class.from_dict(backbone_config)
226
-
227
- verify_backbone_config_arguments(
228
- use_timm_backbone=use_timm_backbone,
229
- use_pretrained_backbone=use_pretrained_backbone,
230
- backbone=backbone,
199
+ backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
231
200
  backbone_config=backbone_config,
232
- backbone_kwargs=backbone_kwargs,
201
+ default_config_type="swin",
202
+ default_config_kwargs={"out_indices": [2, 3, 4]},
203
+ **kwargs,
233
204
  )
234
205
 
235
- if text_config is None:
236
- text_config = {}
237
- logger.info("text_config is None. Initializing the text config with default values (`BertConfig`).")
238
-
239
206
  self.backbone_config = backbone_config
240
- self.backbone = backbone
241
- self.use_pretrained_backbone = use_pretrained_backbone
242
- self.use_timm_backbone = use_timm_backbone
243
- self.backbone_kwargs = backbone_kwargs
244
207
  self.num_queries = num_queries
245
208
  self.d_model = d_model
246
209
  self.encoder_ffn_dim = encoder_ffn_dim
@@ -274,6 +237,7 @@ class MMGroundingDinoConfig(PreTrainedConfig):
274
237
  text_config["model_type"] = text_config.get("model_type", "bert")
275
238
  text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
276
239
  elif text_config is None:
240
+ logger.info("text_config is None. Initializing the text config with default values (`BertConfig`).")
277
241
  text_config = CONFIG_MAPPING["bert"]()
278
242
 
279
243
  self.text_config = text_config
@@ -290,6 +254,7 @@ class MMGroundingDinoConfig(PreTrainedConfig):
290
254
  self.positional_embedding_temperature = positional_embedding_temperature
291
255
  self.init_std = init_std
292
256
  self.layer_norm_eps = layer_norm_eps
257
+ self.tie_word_embeddings = tie_word_embeddings
293
258
 
294
259
  super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
295
260
 
@@ -124,9 +124,12 @@ class MobileBertConfig(PreTrainedConfig):
124
124
  normalization_type="no_norm",
125
125
  classifier_activation=True,
126
126
  classifier_dropout=None,
127
+ tie_word_embeddings=True,
127
128
  **kwargs,
128
129
  ):
129
- super().__init__(pad_token_id=pad_token_id, **kwargs)
130
+ super().__init__(**kwargs)
131
+ self.pad_token_id = pad_token_id
132
+ self.tie_word_embeddings = tie_word_embeddings
130
133
 
131
134
  self.vocab_size = vocab_size
132
135
  self.hidden_size = hidden_size
@@ -204,9 +204,9 @@ class MobileBertSelfAttention(nn.Module):
204
204
  key_layer = self.key(key_tensor).view(*hidden_shape).transpose(1, 2)
205
205
  value_layer = self.value(value_tensor).view(*hidden_shape).transpose(1, 2)
206
206
 
207
- attention_interface: Callable = eager_attention_forward
208
- if self.config._attn_implementation != "eager":
209
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
207
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
208
+ self.config._attn_implementation, eager_attention_forward
209
+ )
210
210
 
211
211
  attn_output, attn_weights = attention_interface(
212
212
  self,