transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (1021)
  1. transformers/__init__.py +4 -11
  2. transformers/activations.py +2 -2
  3. transformers/backbone_utils.py +326 -0
  4. transformers/cache_utils.py +11 -2
  5. transformers/cli/serve.py +11 -8
  6. transformers/configuration_utils.py +1 -69
  7. transformers/conversion_mapping.py +146 -26
  8. transformers/convert_slow_tokenizer.py +6 -4
  9. transformers/core_model_loading.py +207 -118
  10. transformers/dependency_versions_check.py +0 -1
  11. transformers/dependency_versions_table.py +7 -8
  12. transformers/file_utils.py +0 -2
  13. transformers/generation/candidate_generator.py +1 -2
  14. transformers/generation/continuous_batching/cache.py +40 -38
  15. transformers/generation/continuous_batching/cache_manager.py +3 -16
  16. transformers/generation/continuous_batching/continuous_api.py +94 -406
  17. transformers/generation/continuous_batching/input_ouputs.py +464 -0
  18. transformers/generation/continuous_batching/requests.py +54 -17
  19. transformers/generation/continuous_batching/scheduler.py +77 -95
  20. transformers/generation/logits_process.py +10 -5
  21. transformers/generation/stopping_criteria.py +1 -2
  22. transformers/generation/utils.py +75 -95
  23. transformers/image_processing_utils.py +0 -3
  24. transformers/image_processing_utils_fast.py +17 -18
  25. transformers/image_transforms.py +44 -13
  26. transformers/image_utils.py +0 -5
  27. transformers/initialization.py +57 -0
  28. transformers/integrations/__init__.py +10 -24
  29. transformers/integrations/accelerate.py +47 -11
  30. transformers/integrations/deepspeed.py +145 -3
  31. transformers/integrations/executorch.py +2 -6
  32. transformers/integrations/finegrained_fp8.py +142 -7
  33. transformers/integrations/flash_attention.py +2 -7
  34. transformers/integrations/hub_kernels.py +18 -7
  35. transformers/integrations/moe.py +226 -106
  36. transformers/integrations/mxfp4.py +47 -34
  37. transformers/integrations/peft.py +488 -176
  38. transformers/integrations/tensor_parallel.py +641 -581
  39. transformers/masking_utils.py +153 -9
  40. transformers/modeling_flash_attention_utils.py +1 -2
  41. transformers/modeling_utils.py +359 -358
  42. transformers/models/__init__.py +6 -0
  43. transformers/models/afmoe/configuration_afmoe.py +14 -4
  44. transformers/models/afmoe/modeling_afmoe.py +8 -8
  45. transformers/models/afmoe/modular_afmoe.py +7 -7
  46. transformers/models/aimv2/configuration_aimv2.py +2 -7
  47. transformers/models/aimv2/modeling_aimv2.py +26 -24
  48. transformers/models/aimv2/modular_aimv2.py +8 -12
  49. transformers/models/albert/configuration_albert.py +8 -1
  50. transformers/models/albert/modeling_albert.py +3 -3
  51. transformers/models/align/configuration_align.py +8 -5
  52. transformers/models/align/modeling_align.py +22 -24
  53. transformers/models/altclip/configuration_altclip.py +4 -6
  54. transformers/models/altclip/modeling_altclip.py +30 -26
  55. transformers/models/apertus/configuration_apertus.py +5 -7
  56. transformers/models/apertus/modeling_apertus.py +4 -4
  57. transformers/models/apertus/modular_apertus.py +8 -10
  58. transformers/models/arcee/configuration_arcee.py +5 -7
  59. transformers/models/arcee/modeling_arcee.py +4 -4
  60. transformers/models/aria/configuration_aria.py +11 -21
  61. transformers/models/aria/modeling_aria.py +39 -36
  62. transformers/models/aria/modular_aria.py +33 -39
  63. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
  64. transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
  65. transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
  66. transformers/models/auto/auto_factory.py +8 -6
  67. transformers/models/auto/configuration_auto.py +22 -0
  68. transformers/models/auto/image_processing_auto.py +17 -13
  69. transformers/models/auto/modeling_auto.py +15 -0
  70. transformers/models/auto/processing_auto.py +9 -18
  71. transformers/models/auto/tokenization_auto.py +17 -15
  72. transformers/models/autoformer/modeling_autoformer.py +2 -1
  73. transformers/models/aya_vision/configuration_aya_vision.py +4 -0
  74. transformers/models/aya_vision/modeling_aya_vision.py +29 -62
  75. transformers/models/aya_vision/modular_aya_vision.py +20 -45
  76. transformers/models/bamba/configuration_bamba.py +17 -7
  77. transformers/models/bamba/modeling_bamba.py +23 -55
  78. transformers/models/bamba/modular_bamba.py +19 -54
  79. transformers/models/bark/configuration_bark.py +2 -1
  80. transformers/models/bark/modeling_bark.py +24 -10
  81. transformers/models/bart/configuration_bart.py +9 -4
  82. transformers/models/bart/modeling_bart.py +9 -12
  83. transformers/models/beit/configuration_beit.py +2 -4
  84. transformers/models/beit/image_processing_beit_fast.py +3 -3
  85. transformers/models/beit/modeling_beit.py +14 -9
  86. transformers/models/bert/configuration_bert.py +12 -1
  87. transformers/models/bert/modeling_bert.py +6 -30
  88. transformers/models/bert_generation/configuration_bert_generation.py +17 -1
  89. transformers/models/bert_generation/modeling_bert_generation.py +6 -6
  90. transformers/models/big_bird/configuration_big_bird.py +12 -8
  91. transformers/models/big_bird/modeling_big_bird.py +0 -15
  92. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
  93. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
  94. transformers/models/biogpt/configuration_biogpt.py +8 -1
  95. transformers/models/biogpt/modeling_biogpt.py +4 -8
  96. transformers/models/biogpt/modular_biogpt.py +1 -5
  97. transformers/models/bit/configuration_bit.py +2 -4
  98. transformers/models/bit/modeling_bit.py +6 -5
  99. transformers/models/bitnet/configuration_bitnet.py +5 -7
  100. transformers/models/bitnet/modeling_bitnet.py +3 -4
  101. transformers/models/bitnet/modular_bitnet.py +3 -4
  102. transformers/models/blenderbot/configuration_blenderbot.py +8 -4
  103. transformers/models/blenderbot/modeling_blenderbot.py +4 -4
  104. transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
  105. transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
  106. transformers/models/blip/configuration_blip.py +9 -9
  107. transformers/models/blip/modeling_blip.py +55 -37
  108. transformers/models/blip_2/configuration_blip_2.py +2 -1
  109. transformers/models/blip_2/modeling_blip_2.py +81 -56
  110. transformers/models/bloom/configuration_bloom.py +5 -1
  111. transformers/models/bloom/modeling_bloom.py +2 -1
  112. transformers/models/blt/configuration_blt.py +23 -12
  113. transformers/models/blt/modeling_blt.py +20 -14
  114. transformers/models/blt/modular_blt.py +70 -10
  115. transformers/models/bridgetower/configuration_bridgetower.py +7 -1
  116. transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
  117. transformers/models/bridgetower/modeling_bridgetower.py +29 -15
  118. transformers/models/bros/configuration_bros.py +24 -17
  119. transformers/models/camembert/configuration_camembert.py +8 -1
  120. transformers/models/camembert/modeling_camembert.py +6 -6
  121. transformers/models/canine/configuration_canine.py +4 -1
  122. transformers/models/chameleon/configuration_chameleon.py +5 -7
  123. transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
  124. transformers/models/chameleon/modeling_chameleon.py +82 -36
  125. transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
  126. transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
  127. transformers/models/clap/configuration_clap.py +4 -8
  128. transformers/models/clap/modeling_clap.py +21 -22
  129. transformers/models/clip/configuration_clip.py +4 -1
  130. transformers/models/clip/image_processing_clip_fast.py +9 -0
  131. transformers/models/clip/modeling_clip.py +25 -22
  132. transformers/models/clipseg/configuration_clipseg.py +4 -1
  133. transformers/models/clipseg/modeling_clipseg.py +27 -25
  134. transformers/models/clipseg/processing_clipseg.py +11 -3
  135. transformers/models/clvp/configuration_clvp.py +14 -2
  136. transformers/models/clvp/modeling_clvp.py +19 -30
  137. transformers/models/codegen/configuration_codegen.py +4 -3
  138. transformers/models/codegen/modeling_codegen.py +2 -1
  139. transformers/models/cohere/configuration_cohere.py +5 -7
  140. transformers/models/cohere/modeling_cohere.py +4 -4
  141. transformers/models/cohere/modular_cohere.py +3 -3
  142. transformers/models/cohere2/configuration_cohere2.py +6 -8
  143. transformers/models/cohere2/modeling_cohere2.py +4 -4
  144. transformers/models/cohere2/modular_cohere2.py +9 -11
  145. transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
  146. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
  147. transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
  148. transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
  149. transformers/models/colqwen2/modeling_colqwen2.py +7 -6
  150. transformers/models/colqwen2/modular_colqwen2.py +7 -6
  151. transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
  152. transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
  153. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
  154. transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
  155. transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
  156. transformers/models/convbert/configuration_convbert.py +11 -7
  157. transformers/models/convnext/configuration_convnext.py +2 -4
  158. transformers/models/convnext/image_processing_convnext_fast.py +2 -2
  159. transformers/models/convnext/modeling_convnext.py +7 -6
  160. transformers/models/convnextv2/configuration_convnextv2.py +2 -4
  161. transformers/models/convnextv2/modeling_convnextv2.py +7 -6
  162. transformers/models/cpmant/configuration_cpmant.py +4 -0
  163. transformers/models/csm/configuration_csm.py +9 -15
  164. transformers/models/csm/modeling_csm.py +3 -3
  165. transformers/models/ctrl/configuration_ctrl.py +16 -0
  166. transformers/models/ctrl/modeling_ctrl.py +13 -25
  167. transformers/models/cwm/configuration_cwm.py +5 -7
  168. transformers/models/cwm/modeling_cwm.py +4 -4
  169. transformers/models/d_fine/configuration_d_fine.py +10 -56
  170. transformers/models/d_fine/modeling_d_fine.py +728 -868
  171. transformers/models/d_fine/modular_d_fine.py +335 -412
  172. transformers/models/dab_detr/configuration_dab_detr.py +22 -48
  173. transformers/models/dab_detr/modeling_dab_detr.py +11 -7
  174. transformers/models/dac/modeling_dac.py +1 -1
  175. transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
  176. transformers/models/data2vec/configuration_data2vec_text.py +11 -2
  177. transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
  178. transformers/models/data2vec/modeling_data2vec_text.py +6 -6
  179. transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
  180. transformers/models/dbrx/configuration_dbrx.py +11 -3
  181. transformers/models/dbrx/modeling_dbrx.py +6 -6
  182. transformers/models/dbrx/modular_dbrx.py +6 -6
  183. transformers/models/deberta/configuration_deberta.py +6 -0
  184. transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
  185. transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
  186. transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
  187. transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
  188. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
  189. transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
  190. transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
  191. transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
  192. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
  193. transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
  194. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
  195. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
  196. transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
  197. transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
  198. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
  199. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
  200. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
  201. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
  202. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
  203. transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
  204. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
  205. transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
  206. transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
  207. transformers/models/deit/modeling_deit.py +11 -7
  208. transformers/models/depth_anything/configuration_depth_anything.py +12 -42
  209. transformers/models/depth_anything/modeling_depth_anything.py +5 -3
  210. transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
  211. transformers/models/depth_pro/modeling_depth_pro.py +8 -4
  212. transformers/models/detr/configuration_detr.py +18 -49
  213. transformers/models/detr/image_processing_detr_fast.py +11 -11
  214. transformers/models/detr/modeling_detr.py +695 -734
  215. transformers/models/dia/configuration_dia.py +4 -7
  216. transformers/models/dia/generation_dia.py +8 -17
  217. transformers/models/dia/modeling_dia.py +7 -7
  218. transformers/models/dia/modular_dia.py +4 -4
  219. transformers/models/diffllama/configuration_diffllama.py +5 -7
  220. transformers/models/diffllama/modeling_diffllama.py +3 -8
  221. transformers/models/diffllama/modular_diffllama.py +2 -7
  222. transformers/models/dinat/configuration_dinat.py +2 -4
  223. transformers/models/dinat/modeling_dinat.py +7 -6
  224. transformers/models/dinov2/configuration_dinov2.py +2 -4
  225. transformers/models/dinov2/modeling_dinov2.py +9 -8
  226. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
  227. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
  228. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
  229. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
  230. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
  231. transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
  232. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
  233. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
  234. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
  235. transformers/models/distilbert/configuration_distilbert.py +8 -1
  236. transformers/models/distilbert/modeling_distilbert.py +3 -3
  237. transformers/models/doge/configuration_doge.py +17 -7
  238. transformers/models/doge/modeling_doge.py +4 -4
  239. transformers/models/doge/modular_doge.py +20 -10
  240. transformers/models/donut/image_processing_donut_fast.py +4 -4
  241. transformers/models/dots1/configuration_dots1.py +16 -7
  242. transformers/models/dots1/modeling_dots1.py +4 -4
  243. transformers/models/dpr/configuration_dpr.py +19 -1
  244. transformers/models/dpt/configuration_dpt.py +23 -65
  245. transformers/models/dpt/image_processing_dpt_fast.py +5 -5
  246. transformers/models/dpt/modeling_dpt.py +19 -15
  247. transformers/models/dpt/modular_dpt.py +4 -4
  248. transformers/models/edgetam/configuration_edgetam.py +1 -1
  249. transformers/models/edgetam/modeling_edgetam.py +53 -53
  250. transformers/models/edgetam/modular_edgetam.py +5 -7
  251. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
  252. transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
  253. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
  254. transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
  255. transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
  256. transformers/models/electra/configuration_electra.py +13 -2
  257. transformers/models/electra/modeling_electra.py +6 -6
  258. transformers/models/emu3/configuration_emu3.py +12 -10
  259. transformers/models/emu3/modeling_emu3.py +84 -47
  260. transformers/models/emu3/modular_emu3.py +77 -39
  261. transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
  262. transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
  263. transformers/models/eomt/configuration_eomt.py +12 -13
  264. transformers/models/eomt/image_processing_eomt_fast.py +3 -3
  265. transformers/models/eomt/modeling_eomt.py +3 -3
  266. transformers/models/eomt/modular_eomt.py +17 -17
  267. transformers/models/eomt_dinov3/__init__.py +28 -0
  268. transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
  269. transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
  270. transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
  271. transformers/models/ernie/configuration_ernie.py +24 -2
  272. transformers/models/ernie/modeling_ernie.py +6 -30
  273. transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
  274. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  275. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
  276. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
  277. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
  278. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
  279. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
  280. transformers/models/esm/configuration_esm.py +9 -11
  281. transformers/models/esm/modeling_esm.py +3 -3
  282. transformers/models/esm/modeling_esmfold.py +1 -6
  283. transformers/models/esm/openfold_utils/protein.py +2 -3
  284. transformers/models/evolla/configuration_evolla.py +21 -8
  285. transformers/models/evolla/modeling_evolla.py +11 -7
  286. transformers/models/evolla/modular_evolla.py +5 -1
  287. transformers/models/exaone4/configuration_exaone4.py +8 -5
  288. transformers/models/exaone4/modeling_exaone4.py +4 -4
  289. transformers/models/exaone4/modular_exaone4.py +11 -8
  290. transformers/models/exaone_moe/__init__.py +27 -0
  291. transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
  292. transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
  293. transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
  294. transformers/models/falcon/configuration_falcon.py +9 -1
  295. transformers/models/falcon/modeling_falcon.py +3 -8
  296. transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
  297. transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
  298. transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
  299. transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
  300. transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
  301. transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
  302. transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
  303. transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
  304. transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
  305. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
  306. transformers/models/flaubert/configuration_flaubert.py +10 -4
  307. transformers/models/flaubert/modeling_flaubert.py +1 -1
  308. transformers/models/flava/configuration_flava.py +4 -3
  309. transformers/models/flava/image_processing_flava_fast.py +4 -4
  310. transformers/models/flava/modeling_flava.py +36 -28
  311. transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
  312. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
  313. transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
  314. transformers/models/florence2/configuration_florence2.py +4 -0
  315. transformers/models/florence2/modeling_florence2.py +57 -32
  316. transformers/models/florence2/modular_florence2.py +48 -26
  317. transformers/models/fnet/configuration_fnet.py +6 -1
  318. transformers/models/focalnet/configuration_focalnet.py +2 -4
  319. transformers/models/focalnet/modeling_focalnet.py +10 -7
  320. transformers/models/fsmt/configuration_fsmt.py +12 -16
  321. transformers/models/funnel/configuration_funnel.py +8 -0
  322. transformers/models/fuyu/configuration_fuyu.py +5 -8
  323. transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
  324. transformers/models/fuyu/modeling_fuyu.py +24 -23
  325. transformers/models/gemma/configuration_gemma.py +5 -7
  326. transformers/models/gemma/modeling_gemma.py +4 -4
  327. transformers/models/gemma/modular_gemma.py +5 -7
  328. transformers/models/gemma2/configuration_gemma2.py +5 -7
  329. transformers/models/gemma2/modeling_gemma2.py +4 -4
  330. transformers/models/gemma2/modular_gemma2.py +8 -10
  331. transformers/models/gemma3/configuration_gemma3.py +28 -22
  332. transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
  333. transformers/models/gemma3/modeling_gemma3.py +37 -33
  334. transformers/models/gemma3/modular_gemma3.py +46 -42
  335. transformers/models/gemma3n/configuration_gemma3n.py +35 -22
  336. transformers/models/gemma3n/modeling_gemma3n.py +86 -58
  337. transformers/models/gemma3n/modular_gemma3n.py +112 -75
  338. transformers/models/git/configuration_git.py +5 -7
  339. transformers/models/git/modeling_git.py +31 -41
  340. transformers/models/glm/configuration_glm.py +7 -9
  341. transformers/models/glm/modeling_glm.py +4 -4
  342. transformers/models/glm4/configuration_glm4.py +7 -9
  343. transformers/models/glm4/modeling_glm4.py +4 -4
  344. transformers/models/glm46v/configuration_glm46v.py +4 -0
  345. transformers/models/glm46v/image_processing_glm46v.py +5 -2
  346. transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
  347. transformers/models/glm46v/modeling_glm46v.py +91 -46
  348. transformers/models/glm46v/modular_glm46v.py +4 -0
  349. transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
  350. transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
  351. transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
  352. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
  353. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
  354. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
  355. transformers/models/glm4v/configuration_glm4v.py +12 -8
  356. transformers/models/glm4v/image_processing_glm4v.py +5 -2
  357. transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
  358. transformers/models/glm4v/modeling_glm4v.py +120 -63
  359. transformers/models/glm4v/modular_glm4v.py +82 -50
  360. transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
  361. transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
  362. transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
  363. transformers/models/glm_image/configuration_glm_image.py +26 -20
  364. transformers/models/glm_image/image_processing_glm_image.py +1 -1
  365. transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
  366. transformers/models/glm_image/modeling_glm_image.py +337 -236
  367. transformers/models/glm_image/modular_glm_image.py +415 -255
  368. transformers/models/glm_image/processing_glm_image.py +65 -17
  369. transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
  370. transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
  371. transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
  372. transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
  373. transformers/models/glmasr/modeling_glmasr.py +34 -28
  374. transformers/models/glmasr/modular_glmasr.py +23 -11
  375. transformers/models/glpn/image_processing_glpn_fast.py +3 -3
  376. transformers/models/glpn/modeling_glpn.py +4 -2
  377. transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
  378. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
  379. transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
  380. transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
  381. transformers/models/gpt2/configuration_gpt2.py +13 -1
  382. transformers/models/gpt2/modeling_gpt2.py +5 -5
  383. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
  384. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
  385. transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
  386. transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
  387. transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
  388. transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
  389. transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
  390. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
  391. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
  392. transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
  393. transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
  394. transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
  395. transformers/models/gptj/configuration_gptj.py +4 -4
  396. transformers/models/gptj/modeling_gptj.py +3 -7
  397. transformers/models/granite/configuration_granite.py +5 -7
  398. transformers/models/granite/modeling_granite.py +4 -4
  399. transformers/models/granite_speech/modeling_granite_speech.py +63 -37
  400. transformers/models/granitemoe/configuration_granitemoe.py +5 -7
  401. transformers/models/granitemoe/modeling_granitemoe.py +4 -4
  402. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
  403. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
  404. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
  405. transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
  406. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
  407. transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
  408. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
  409. transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
  410. transformers/models/groupvit/configuration_groupvit.py +4 -1
  411. transformers/models/groupvit/modeling_groupvit.py +29 -22
  412. transformers/models/helium/configuration_helium.py +5 -7
  413. transformers/models/helium/modeling_helium.py +4 -4
  414. transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
  415. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
  416. transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
  417. transformers/models/hiera/configuration_hiera.py +2 -4
  418. transformers/models/hiera/modeling_hiera.py +11 -8
  419. transformers/models/hubert/configuration_hubert.py +4 -1
  420. transformers/models/hubert/modeling_hubert.py +7 -4
  421. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
  422. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
  423. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
  424. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
  425. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
  426. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
  427. transformers/models/ibert/configuration_ibert.py +4 -1
  428. transformers/models/idefics/configuration_idefics.py +5 -7
  429. transformers/models/idefics/modeling_idefics.py +3 -4
  430. transformers/models/idefics/vision.py +5 -4
  431. transformers/models/idefics2/configuration_idefics2.py +1 -2
  432. transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
  433. transformers/models/idefics2/modeling_idefics2.py +72 -50
  434. transformers/models/idefics3/configuration_idefics3.py +1 -3
  435. transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
  436. transformers/models/idefics3/modeling_idefics3.py +63 -40
  437. transformers/models/ijepa/modeling_ijepa.py +3 -3
  438. transformers/models/imagegpt/configuration_imagegpt.py +9 -1
  439. transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
  440. transformers/models/imagegpt/modeling_imagegpt.py +8 -4
  441. transformers/models/informer/modeling_informer.py +3 -3
  442. transformers/models/instructblip/configuration_instructblip.py +2 -1
  443. transformers/models/instructblip/modeling_instructblip.py +65 -39
  444. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
  445. transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
  446. transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
  447. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
  448. transformers/models/internvl/configuration_internvl.py +5 -0
  449. transformers/models/internvl/modeling_internvl.py +35 -55
  450. transformers/models/internvl/modular_internvl.py +26 -38
  451. transformers/models/internvl/video_processing_internvl.py +2 -2
  452. transformers/models/jais2/configuration_jais2.py +5 -7
  453. transformers/models/jais2/modeling_jais2.py +4 -4
  454. transformers/models/jamba/configuration_jamba.py +5 -7
  455. transformers/models/jamba/modeling_jamba.py +4 -4
  456. transformers/models/jamba/modular_jamba.py +3 -3
  457. transformers/models/janus/image_processing_janus.py +2 -2
  458. transformers/models/janus/image_processing_janus_fast.py +8 -8
  459. transformers/models/janus/modeling_janus.py +63 -146
  460. transformers/models/janus/modular_janus.py +62 -20
  461. transformers/models/jetmoe/configuration_jetmoe.py +6 -4
  462. transformers/models/jetmoe/modeling_jetmoe.py +3 -3
  463. transformers/models/jetmoe/modular_jetmoe.py +3 -3
  464. transformers/models/kosmos2/configuration_kosmos2.py +10 -8
  465. transformers/models/kosmos2/modeling_kosmos2.py +56 -34
  466. transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
  467. transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
  468. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
  469. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
  470. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
  471. transformers/models/lasr/configuration_lasr.py +2 -4
  472. transformers/models/lasr/modeling_lasr.py +3 -3
  473. transformers/models/lasr/modular_lasr.py +3 -3
  474. transformers/models/layoutlm/configuration_layoutlm.py +14 -1
  475. transformers/models/layoutlm/modeling_layoutlm.py +3 -3
  476. transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
  477. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
  478. transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
  479. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
  480. transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
  481. transformers/models/led/configuration_led.py +7 -8
  482. transformers/models/levit/image_processing_levit_fast.py +4 -4
  483. transformers/models/lfm2/configuration_lfm2.py +5 -7
  484. transformers/models/lfm2/modeling_lfm2.py +4 -4
  485. transformers/models/lfm2/modular_lfm2.py +3 -3
  486. transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
  487. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
  488. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  489. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
  490. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
  491. transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
  492. transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
  493. transformers/models/lightglue/modeling_lightglue.py +3 -3
  494. transformers/models/lightglue/modular_lightglue.py +3 -3
  495. transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
  496. transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
  497. transformers/models/lilt/configuration_lilt.py +6 -1
  498. transformers/models/llama/configuration_llama.py +5 -7
  499. transformers/models/llama/modeling_llama.py +4 -4
  500. transformers/models/llama4/configuration_llama4.py +67 -47
  501. transformers/models/llama4/image_processing_llama4_fast.py +3 -3
  502. transformers/models/llama4/modeling_llama4.py +46 -44
  503. transformers/models/llava/configuration_llava.py +10 -0
  504. transformers/models/llava/image_processing_llava_fast.py +3 -3
  505. transformers/models/llava/modeling_llava.py +38 -65
  506. transformers/models/llava_next/configuration_llava_next.py +2 -1
  507. transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
  508. transformers/models/llava_next/modeling_llava_next.py +61 -60
  509. transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
  510. transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
  511. transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
  512. transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
  513. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
  514. transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
  515. transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
  516. transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
  517. transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
  518. transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
  519. transformers/models/longformer/configuration_longformer.py +4 -1
  520. transformers/models/longt5/configuration_longt5.py +9 -6
  521. transformers/models/longt5/modeling_longt5.py +2 -1
  522. transformers/models/luke/configuration_luke.py +8 -1
  523. transformers/models/lw_detr/configuration_lw_detr.py +19 -31
  524. transformers/models/lw_detr/modeling_lw_detr.py +43 -44
  525. transformers/models/lw_detr/modular_lw_detr.py +36 -38
  526. transformers/models/lxmert/configuration_lxmert.py +16 -0
  527. transformers/models/m2m_100/configuration_m2m_100.py +7 -8
  528. transformers/models/m2m_100/modeling_m2m_100.py +3 -3
  529. transformers/models/mamba/configuration_mamba.py +5 -2
  530. transformers/models/mamba/modeling_mamba.py +18 -26
  531. transformers/models/mamba2/configuration_mamba2.py +5 -7
  532. transformers/models/mamba2/modeling_mamba2.py +22 -33
  533. transformers/models/marian/configuration_marian.py +10 -4
  534. transformers/models/marian/modeling_marian.py +4 -4
  535. transformers/models/markuplm/configuration_markuplm.py +4 -6
  536. transformers/models/markuplm/modeling_markuplm.py +3 -3
  537. transformers/models/mask2former/configuration_mask2former.py +12 -47
  538. transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
  539. transformers/models/mask2former/modeling_mask2former.py +18 -12
  540. transformers/models/maskformer/configuration_maskformer.py +14 -45
  541. transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
  542. transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
  543. transformers/models/maskformer/modeling_maskformer.py +15 -9
  544. transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
  545. transformers/models/mbart/configuration_mbart.py +9 -4
  546. transformers/models/mbart/modeling_mbart.py +9 -6
  547. transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
  548. transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
  549. transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
  550. transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
  551. transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
  552. transformers/models/mgp_str/modeling_mgp_str.py +4 -2
  553. transformers/models/mimi/configuration_mimi.py +4 -0
  554. transformers/models/mimi/modeling_mimi.py +40 -36
  555. transformers/models/minimax/configuration_minimax.py +8 -11
  556. transformers/models/minimax/modeling_minimax.py +5 -5
  557. transformers/models/minimax/modular_minimax.py +9 -12
  558. transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
  559. transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
  560. transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
  561. transformers/models/ministral/configuration_ministral.py +5 -7
  562. transformers/models/ministral/modeling_ministral.py +4 -4
  563. transformers/models/ministral/modular_ministral.py +5 -8
  564. transformers/models/ministral3/configuration_ministral3.py +4 -4
  565. transformers/models/ministral3/modeling_ministral3.py +4 -4
  566. transformers/models/ministral3/modular_ministral3.py +3 -3
  567. transformers/models/mistral/configuration_mistral.py +5 -7
  568. transformers/models/mistral/modeling_mistral.py +4 -4
  569. transformers/models/mistral/modular_mistral.py +3 -3
  570. transformers/models/mistral3/configuration_mistral3.py +4 -0
  571. transformers/models/mistral3/modeling_mistral3.py +36 -40
  572. transformers/models/mistral3/modular_mistral3.py +31 -32
  573. transformers/models/mixtral/configuration_mixtral.py +8 -11
  574. transformers/models/mixtral/modeling_mixtral.py +4 -4
  575. transformers/models/mlcd/modeling_mlcd.py +7 -5
  576. transformers/models/mlcd/modular_mlcd.py +7 -5
  577. transformers/models/mllama/configuration_mllama.py +5 -7
  578. transformers/models/mllama/image_processing_mllama_fast.py +6 -5
  579. transformers/models/mllama/modeling_mllama.py +19 -19
  580. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
  581. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
  582. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
  583. transformers/models/mobilebert/configuration_mobilebert.py +4 -1
  584. transformers/models/mobilebert/modeling_mobilebert.py +3 -3
  585. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
  586. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
  587. transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
  588. transformers/models/mobilevit/modeling_mobilevit.py +4 -2
  589. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
  590. transformers/models/modernbert/configuration_modernbert.py +46 -21
  591. transformers/models/modernbert/modeling_modernbert.py +146 -899
  592. transformers/models/modernbert/modular_modernbert.py +185 -908
  593. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
  594. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
  595. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
  596. transformers/models/moonshine/configuration_moonshine.py +12 -7
  597. transformers/models/moonshine/modeling_moonshine.py +7 -7
  598. transformers/models/moonshine/modular_moonshine.py +19 -13
  599. transformers/models/moshi/configuration_moshi.py +28 -2
  600. transformers/models/moshi/modeling_moshi.py +4 -9
  601. transformers/models/mpnet/configuration_mpnet.py +6 -1
  602. transformers/models/mpt/configuration_mpt.py +16 -0
  603. transformers/models/mra/configuration_mra.py +8 -1
  604. transformers/models/mt5/configuration_mt5.py +9 -5
  605. transformers/models/mt5/modeling_mt5.py +5 -8
  606. transformers/models/musicgen/configuration_musicgen.py +12 -7
  607. transformers/models/musicgen/modeling_musicgen.py +6 -5
  608. transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
  609. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
  610. transformers/models/mvp/configuration_mvp.py +8 -4
  611. transformers/models/mvp/modeling_mvp.py +6 -4
  612. transformers/models/nanochat/configuration_nanochat.py +5 -7
  613. transformers/models/nanochat/modeling_nanochat.py +4 -4
  614. transformers/models/nanochat/modular_nanochat.py +4 -4
  615. transformers/models/nemotron/configuration_nemotron.py +5 -7
  616. transformers/models/nemotron/modeling_nemotron.py +4 -14
  617. transformers/models/nllb/tokenization_nllb.py +7 -5
  618. transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
  619. transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
  620. transformers/models/nougat/image_processing_nougat_fast.py +8 -8
  621. transformers/models/nystromformer/configuration_nystromformer.py +8 -1
  622. transformers/models/olmo/configuration_olmo.py +5 -7
  623. transformers/models/olmo/modeling_olmo.py +4 -4
  624. transformers/models/olmo/modular_olmo.py +3 -3
  625. transformers/models/olmo2/configuration_olmo2.py +9 -11
  626. transformers/models/olmo2/modeling_olmo2.py +4 -4
  627. transformers/models/olmo2/modular_olmo2.py +7 -7
  628. transformers/models/olmo3/configuration_olmo3.py +10 -11
  629. transformers/models/olmo3/modeling_olmo3.py +4 -4
  630. transformers/models/olmo3/modular_olmo3.py +13 -14
  631. transformers/models/olmoe/configuration_olmoe.py +5 -7
  632. transformers/models/olmoe/modeling_olmoe.py +4 -4
  633. transformers/models/olmoe/modular_olmoe.py +3 -3
  634. transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
  635. transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
  636. transformers/models/oneformer/configuration_oneformer.py +9 -46
  637. transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
  638. transformers/models/oneformer/modeling_oneformer.py +14 -9
  639. transformers/models/openai/configuration_openai.py +16 -0
  640. transformers/models/opt/configuration_opt.py +6 -6
  641. transformers/models/opt/modeling_opt.py +5 -5
  642. transformers/models/ovis2/configuration_ovis2.py +4 -0
  643. transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
  644. transformers/models/ovis2/modeling_ovis2.py +58 -99
  645. transformers/models/ovis2/modular_ovis2.py +52 -13
  646. transformers/models/owlv2/configuration_owlv2.py +4 -1
  647. transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
  648. transformers/models/owlv2/modeling_owlv2.py +40 -27
  649. transformers/models/owlv2/modular_owlv2.py +5 -5
  650. transformers/models/owlvit/configuration_owlvit.py +4 -1
  651. transformers/models/owlvit/modeling_owlvit.py +40 -27
  652. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
  653. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
  654. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
  655. transformers/models/paligemma/configuration_paligemma.py +4 -0
  656. transformers/models/paligemma/modeling_paligemma.py +30 -26
  657. transformers/models/parakeet/configuration_parakeet.py +2 -4
  658. transformers/models/parakeet/modeling_parakeet.py +3 -3
  659. transformers/models/parakeet/modular_parakeet.py +3 -3
  660. transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
  661. transformers/models/patchtst/modeling_patchtst.py +3 -3
  662. transformers/models/pe_audio/modeling_pe_audio.py +4 -4
  663. transformers/models/pe_audio/modular_pe_audio.py +1 -1
  664. transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
  665. transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
  666. transformers/models/pe_video/modeling_pe_video.py +36 -24
  667. transformers/models/pe_video/modular_pe_video.py +36 -23
  668. transformers/models/pegasus/configuration_pegasus.py +8 -5
  669. transformers/models/pegasus/modeling_pegasus.py +4 -4
  670. transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
  671. transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
  672. transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
  673. transformers/models/perceiver/modeling_perceiver.py +17 -9
  674. transformers/models/perception_lm/modeling_perception_lm.py +26 -27
  675. transformers/models/perception_lm/modular_perception_lm.py +27 -25
  676. transformers/models/persimmon/configuration_persimmon.py +5 -7
  677. transformers/models/persimmon/modeling_persimmon.py +5 -5
  678. transformers/models/phi/configuration_phi.py +8 -6
  679. transformers/models/phi/modeling_phi.py +4 -4
  680. transformers/models/phi/modular_phi.py +3 -3
  681. transformers/models/phi3/configuration_phi3.py +9 -11
  682. transformers/models/phi3/modeling_phi3.py +4 -4
  683. transformers/models/phi3/modular_phi3.py +3 -3
  684. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
  685. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
  686. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
  687. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
  688. transformers/models/phimoe/configuration_phimoe.py +5 -7
  689. transformers/models/phimoe/modeling_phimoe.py +15 -39
  690. transformers/models/phimoe/modular_phimoe.py +12 -7
  691. transformers/models/pix2struct/configuration_pix2struct.py +12 -9
  692. transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
  693. transformers/models/pix2struct/modeling_pix2struct.py +14 -7
  694. transformers/models/pixio/configuration_pixio.py +2 -4
  695. transformers/models/pixio/modeling_pixio.py +9 -8
  696. transformers/models/pixio/modular_pixio.py +4 -2
  697. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
  698. transformers/models/pixtral/modeling_pixtral.py +9 -12
  699. transformers/models/plbart/configuration_plbart.py +8 -5
  700. transformers/models/plbart/modeling_plbart.py +9 -7
  701. transformers/models/plbart/modular_plbart.py +1 -1
  702. transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
  703. transformers/models/pop2piano/configuration_pop2piano.py +7 -6
  704. transformers/models/pop2piano/modeling_pop2piano.py +2 -1
  705. transformers/models/pp_doclayout_v3/__init__.py +30 -0
  706. transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
  707. transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
  708. transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
  709. transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
  710. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
  711. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
  712. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
  713. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
  714. transformers/models/prophetnet/configuration_prophetnet.py +11 -10
  715. transformers/models/prophetnet/modeling_prophetnet.py +12 -23
  716. transformers/models/pvt/image_processing_pvt.py +7 -7
  717. transformers/models/pvt/image_processing_pvt_fast.py +1 -1
  718. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  719. transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
  720. transformers/models/qwen2/configuration_qwen2.py +14 -4
  721. transformers/models/qwen2/modeling_qwen2.py +4 -4
  722. transformers/models/qwen2/modular_qwen2.py +3 -3
  723. transformers/models/qwen2/tokenization_qwen2.py +0 -4
  724. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
  725. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
  726. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
  727. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
  728. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
  729. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
  730. transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
  731. transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
  732. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  733. transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
  734. transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
  735. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
  736. transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
  737. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
  738. transformers/models/qwen3/configuration_qwen3.py +15 -5
  739. transformers/models/qwen3/modeling_qwen3.py +4 -4
  740. transformers/models/qwen3/modular_qwen3.py +3 -3
  741. transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
  742. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  743. transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
  744. transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
  745. transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
  746. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
  747. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
  748. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
  749. transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
  750. transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
  751. transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
  752. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
  753. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
  754. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
  755. transformers/models/rag/configuration_rag.py +6 -6
  756. transformers/models/rag/modeling_rag.py +3 -3
  757. transformers/models/rag/retrieval_rag.py +1 -1
  758. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
  759. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
  760. transformers/models/reformer/configuration_reformer.py +7 -7
  761. transformers/models/rembert/configuration_rembert.py +8 -1
  762. transformers/models/rembert/modeling_rembert.py +0 -22
  763. transformers/models/resnet/configuration_resnet.py +2 -4
  764. transformers/models/resnet/modeling_resnet.py +6 -5
  765. transformers/models/roberta/configuration_roberta.py +11 -2
  766. transformers/models/roberta/modeling_roberta.py +6 -6
  767. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
  768. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
  769. transformers/models/roc_bert/configuration_roc_bert.py +8 -1
  770. transformers/models/roc_bert/modeling_roc_bert.py +6 -41
  771. transformers/models/roformer/configuration_roformer.py +13 -2
  772. transformers/models/roformer/modeling_roformer.py +0 -14
  773. transformers/models/rt_detr/configuration_rt_detr.py +8 -49
  774. transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
  775. transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
  776. transformers/models/rt_detr/modeling_rt_detr.py +578 -737
  777. transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
  778. transformers/models/rt_detr/modular_rt_detr.py +1508 -6
  779. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
  780. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
  781. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
  782. transformers/models/rwkv/configuration_rwkv.py +2 -3
  783. transformers/models/rwkv/modeling_rwkv.py +0 -23
  784. transformers/models/sam/configuration_sam.py +2 -0
  785. transformers/models/sam/image_processing_sam_fast.py +4 -4
  786. transformers/models/sam/modeling_sam.py +13 -8
  787. transformers/models/sam/processing_sam.py +3 -3
  788. transformers/models/sam2/configuration_sam2.py +1 -1
  789. transformers/models/sam2/modeling_sam2.py +56 -52
  790. transformers/models/sam2/modular_sam2.py +47 -55
  791. transformers/models/sam2_video/modeling_sam2_video.py +50 -51
  792. transformers/models/sam2_video/modular_sam2_video.py +12 -10
  793. transformers/models/sam3/modeling_sam3.py +43 -47
  794. transformers/models/sam3/processing_sam3.py +8 -4
  795. transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
  796. transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
  797. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
  798. transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
  799. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
  800. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
  801. transformers/models/sam3_video/modeling_sam3_video.py +27 -14
  802. transformers/models/sam_hq/configuration_sam_hq.py +2 -0
  803. transformers/models/sam_hq/modeling_sam_hq.py +13 -9
  804. transformers/models/sam_hq/modular_sam_hq.py +6 -6
  805. transformers/models/sam_hq/processing_sam_hq.py +7 -6
  806. transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
  807. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
  808. transformers/models/seed_oss/configuration_seed_oss.py +7 -9
  809. transformers/models/seed_oss/modeling_seed_oss.py +4 -4
  810. transformers/models/seed_oss/modular_seed_oss.py +3 -3
  811. transformers/models/segformer/image_processing_segformer_fast.py +4 -4
  812. transformers/models/segformer/modeling_segformer.py +4 -2
  813. transformers/models/segformer/modular_segformer.py +3 -3
  814. transformers/models/seggpt/modeling_seggpt.py +20 -8
  815. transformers/models/sew/configuration_sew.py +4 -1
  816. transformers/models/sew/modeling_sew.py +9 -5
  817. transformers/models/sew/modular_sew.py +2 -1
  818. transformers/models/sew_d/configuration_sew_d.py +4 -1
  819. transformers/models/sew_d/modeling_sew_d.py +4 -1
  820. transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
  821. transformers/models/siglip/configuration_siglip.py +4 -1
  822. transformers/models/siglip/modeling_siglip.py +27 -71
  823. transformers/models/siglip2/__init__.py +1 -0
  824. transformers/models/siglip2/configuration_siglip2.py +4 -2
  825. transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
  826. transformers/models/siglip2/modeling_siglip2.py +37 -78
  827. transformers/models/siglip2/modular_siglip2.py +74 -25
  828. transformers/models/siglip2/tokenization_siglip2.py +95 -0
  829. transformers/models/smollm3/configuration_smollm3.py +6 -6
  830. transformers/models/smollm3/modeling_smollm3.py +4 -4
  831. transformers/models/smollm3/modular_smollm3.py +9 -9
  832. transformers/models/smolvlm/configuration_smolvlm.py +1 -3
  833. transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
  834. transformers/models/smolvlm/modeling_smolvlm.py +75 -46
  835. transformers/models/smolvlm/modular_smolvlm.py +36 -23
  836. transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
  837. transformers/models/solar_open/__init__.py +27 -0
  838. transformers/models/solar_open/configuration_solar_open.py +184 -0
  839. transformers/models/solar_open/modeling_solar_open.py +642 -0
  840. transformers/models/solar_open/modular_solar_open.py +224 -0
  841. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
  842. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
  843. transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
  844. transformers/models/speecht5/configuration_speecht5.py +7 -8
  845. transformers/models/splinter/configuration_splinter.py +6 -6
  846. transformers/models/splinter/modeling_splinter.py +8 -3
  847. transformers/models/squeezebert/configuration_squeezebert.py +14 -1
  848. transformers/models/stablelm/configuration_stablelm.py +8 -6
  849. transformers/models/stablelm/modeling_stablelm.py +5 -5
  850. transformers/models/starcoder2/configuration_starcoder2.py +11 -5
  851. transformers/models/starcoder2/modeling_starcoder2.py +5 -5
  852. transformers/models/starcoder2/modular_starcoder2.py +4 -4
  853. transformers/models/superglue/configuration_superglue.py +4 -0
  854. transformers/models/superglue/image_processing_superglue_fast.py +4 -3
  855. transformers/models/superglue/modeling_superglue.py +9 -4
  856. transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
  857. transformers/models/superpoint/modeling_superpoint.py +4 -2
  858. transformers/models/swin/configuration_swin.py +2 -4
  859. transformers/models/swin/modeling_swin.py +11 -8
  860. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
  861. transformers/models/swin2sr/modeling_swin2sr.py +4 -2
  862. transformers/models/swinv2/configuration_swinv2.py +2 -4
  863. transformers/models/swinv2/modeling_swinv2.py +10 -7
  864. transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
  865. transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
  866. transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
  867. transformers/models/t5/configuration_t5.py +9 -8
  868. transformers/models/t5/modeling_t5.py +5 -8
  869. transformers/models/t5gemma/configuration_t5gemma.py +10 -25
  870. transformers/models/t5gemma/modeling_t5gemma.py +9 -9
  871. transformers/models/t5gemma/modular_t5gemma.py +11 -24
  872. transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
  873. transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
  874. transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
  875. transformers/models/table_transformer/configuration_table_transformer.py +18 -49
  876. transformers/models/table_transformer/modeling_table_transformer.py +27 -53
  877. transformers/models/tapas/configuration_tapas.py +12 -1
  878. transformers/models/tapas/modeling_tapas.py +1 -1
  879. transformers/models/tapas/tokenization_tapas.py +1 -0
  880. transformers/models/textnet/configuration_textnet.py +4 -6
  881. transformers/models/textnet/image_processing_textnet_fast.py +3 -3
  882. transformers/models/textnet/modeling_textnet.py +15 -14
  883. transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
  884. transformers/models/timesfm/modeling_timesfm.py +5 -6
  885. transformers/models/timesfm/modular_timesfm.py +5 -6
  886. transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
  887. transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
  888. transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
  889. transformers/models/trocr/configuration_trocr.py +11 -7
  890. transformers/models/trocr/modeling_trocr.py +4 -2
  891. transformers/models/tvp/configuration_tvp.py +10 -35
  892. transformers/models/tvp/image_processing_tvp_fast.py +6 -5
  893. transformers/models/tvp/modeling_tvp.py +1 -1
  894. transformers/models/udop/configuration_udop.py +16 -7
  895. transformers/models/udop/modeling_udop.py +10 -6
  896. transformers/models/umt5/configuration_umt5.py +8 -6
  897. transformers/models/umt5/modeling_umt5.py +7 -3
  898. transformers/models/unispeech/configuration_unispeech.py +4 -1
  899. transformers/models/unispeech/modeling_unispeech.py +7 -4
  900. transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
  901. transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
  902. transformers/models/upernet/configuration_upernet.py +8 -35
  903. transformers/models/upernet/modeling_upernet.py +1 -1
  904. transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
  905. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  906. transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
  907. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
  908. transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
  909. transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
  910. transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
  911. transformers/models/video_llava/configuration_video_llava.py +4 -0
  912. transformers/models/video_llava/modeling_video_llava.py +87 -89
  913. transformers/models/videomae/modeling_videomae.py +4 -5
  914. transformers/models/vilt/configuration_vilt.py +4 -1
  915. transformers/models/vilt/image_processing_vilt_fast.py +6 -6
  916. transformers/models/vilt/modeling_vilt.py +27 -12
  917. transformers/models/vipllava/configuration_vipllava.py +4 -0
  918. transformers/models/vipllava/modeling_vipllava.py +57 -31
  919. transformers/models/vipllava/modular_vipllava.py +50 -24
  920. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
  921. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
  922. transformers/models/visual_bert/configuration_visual_bert.py +6 -1
  923. transformers/models/vit/configuration_vit.py +2 -2
  924. transformers/models/vit/modeling_vit.py +7 -5
  925. transformers/models/vit_mae/modeling_vit_mae.py +11 -7
  926. transformers/models/vit_msn/modeling_vit_msn.py +11 -7
  927. transformers/models/vitdet/configuration_vitdet.py +2 -4
  928. transformers/models/vitdet/modeling_vitdet.py +2 -3
  929. transformers/models/vitmatte/configuration_vitmatte.py +6 -35
  930. transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
  931. transformers/models/vitmatte/modeling_vitmatte.py +1 -1
  932. transformers/models/vitpose/configuration_vitpose.py +6 -43
  933. transformers/models/vitpose/modeling_vitpose.py +5 -3
  934. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
  935. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
  936. transformers/models/vits/configuration_vits.py +4 -0
  937. transformers/models/vits/modeling_vits.py +9 -7
  938. transformers/models/vivit/modeling_vivit.py +4 -4
  939. transformers/models/vjepa2/modeling_vjepa2.py +9 -9
  940. transformers/models/voxtral/configuration_voxtral.py +0 -1
  941. transformers/models/voxtral/modeling_voxtral.py +25 -24
  942. transformers/models/voxtral/modular_voxtral.py +26 -20
  943. transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
  944. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
  945. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
  946. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
  947. transformers/models/wavlm/configuration_wavlm.py +4 -1
  948. transformers/models/wavlm/modeling_wavlm.py +4 -1
  949. transformers/models/whisper/configuration_whisper.py +6 -4
  950. transformers/models/whisper/generation_whisper.py +0 -1
  951. transformers/models/whisper/modeling_whisper.py +3 -3
  952. transformers/models/x_clip/configuration_x_clip.py +4 -1
  953. transformers/models/x_clip/modeling_x_clip.py +26 -27
  954. transformers/models/xglm/configuration_xglm.py +9 -7
  955. transformers/models/xlm/configuration_xlm.py +10 -7
  956. transformers/models/xlm/modeling_xlm.py +1 -1
  957. transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
  958. transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
  959. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
  960. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
  961. transformers/models/xlnet/configuration_xlnet.py +3 -1
  962. transformers/models/xlstm/configuration_xlstm.py +5 -7
  963. transformers/models/xlstm/modeling_xlstm.py +0 -32
  964. transformers/models/xmod/configuration_xmod.py +11 -2
  965. transformers/models/xmod/modeling_xmod.py +13 -16
  966. transformers/models/yolos/image_processing_yolos_fast.py +25 -28
  967. transformers/models/yolos/modeling_yolos.py +7 -7
  968. transformers/models/yolos/modular_yolos.py +16 -16
  969. transformers/models/yoso/configuration_yoso.py +8 -1
  970. transformers/models/youtu/__init__.py +27 -0
  971. transformers/models/youtu/configuration_youtu.py +194 -0
  972. transformers/models/youtu/modeling_youtu.py +619 -0
  973. transformers/models/youtu/modular_youtu.py +254 -0
  974. transformers/models/zamba/configuration_zamba.py +5 -7
  975. transformers/models/zamba/modeling_zamba.py +25 -56
  976. transformers/models/zamba2/configuration_zamba2.py +8 -13
  977. transformers/models/zamba2/modeling_zamba2.py +53 -78
  978. transformers/models/zamba2/modular_zamba2.py +36 -29
  979. transformers/models/zoedepth/configuration_zoedepth.py +17 -40
  980. transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
  981. transformers/models/zoedepth/modeling_zoedepth.py +5 -3
  982. transformers/pipelines/__init__.py +1 -61
  983. transformers/pipelines/any_to_any.py +1 -1
  984. transformers/pipelines/automatic_speech_recognition.py +0 -2
  985. transformers/pipelines/base.py +1 -1
  986. transformers/pipelines/image_text_to_text.py +1 -1
  987. transformers/pipelines/text_to_audio.py +5 -1
  988. transformers/processing_utils.py +35 -44
  989. transformers/pytorch_utils.py +2 -26
  990. transformers/quantizers/quantizer_compressed_tensors.py +7 -5
  991. transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
  992. transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
  993. transformers/quantizers/quantizer_mxfp4.py +1 -1
  994. transformers/quantizers/quantizer_torchao.py +0 -16
  995. transformers/safetensors_conversion.py +11 -4
  996. transformers/testing_utils.py +3 -28
  997. transformers/tokenization_mistral_common.py +9 -0
  998. transformers/tokenization_python.py +6 -4
  999. transformers/tokenization_utils_base.py +119 -219
  1000. transformers/tokenization_utils_tokenizers.py +31 -2
  1001. transformers/trainer.py +25 -33
  1002. transformers/trainer_seq2seq.py +1 -1
  1003. transformers/training_args.py +411 -417
  1004. transformers/utils/__init__.py +1 -4
  1005. transformers/utils/auto_docstring.py +15 -18
  1006. transformers/utils/backbone_utils.py +13 -373
  1007. transformers/utils/doc.py +4 -36
  1008. transformers/utils/generic.py +69 -33
  1009. transformers/utils/import_utils.py +72 -75
  1010. transformers/utils/loading_report.py +133 -105
  1011. transformers/utils/quantization_config.py +0 -21
  1012. transformers/video_processing_utils.py +5 -5
  1013. transformers/video_utils.py +3 -1
  1014. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
  1015. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
  1016. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
  1017. transformers/pipelines/deprecated/text2text_generation.py +0 -408
  1018. transformers/pipelines/image_to_text.py +0 -189
  1019. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
  1020. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
  1021. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,61 @@
- import torch
+ # Copyright 2022 Microsoft Research Asia and The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import math
+ from collections.abc import Callable
 
- from transformers.models.detr.image_processing_detr_fast import DetrImageProcessorFast
+ import torch
+ from torch import nn
 
  from ...image_transforms import (
      center_to_corners_format,
  )
+ from ...masking_utils import create_bidirectional_mask
+ from ...modeling_outputs import (
+     BaseModelOutput,
+ )
+ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+ from ...processing_utils import Unpack
  from ...utils import (
      TensorType,
+     TransformersKwargs,
+     auto_docstring,
      logging,
  )
+ from ...utils.generic import OutputRecorder, can_return_tuple, check_model_inputs
+ from ..deformable_detr.modeling_deformable_detr import inverse_sigmoid
+ from ..detr.image_processing_detr_fast import DetrImageProcessorFast
+ from ..detr.modeling_detr import (
+     DetrConvEncoder,
+     DetrDecoderLayer,
+     DetrDecoderOutput,
+     DetrEncoder,
+     DetrEncoderLayer,
+     DetrForObjectDetection,
+     DetrForSegmentation,
+     DetrLearnedPositionEmbedding,
+     DetrMLP,
+     DetrMLPPredictionHead,
+     DetrModel,
+     DetrModelOutput,
+     DetrObjectDetectionOutput,
+     DetrPreTrainedModel,
+     DetrSegmentationOutput,
+     DetrSelfAttention,
+     DetrSinePositionEmbedding,
+     eager_attention_forward,
+ )
+ from .configuration_conditional_detr import ConditionalDetrConfig
 
 
  logger = logging.get_logger(__name__)
@@ -74,5 +121,856 @@ class ConditionalDetrImageProcessorFast(DetrImageProcessorFast):
 
          return results
 
+     def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple[int, int]] | None = None):
+         """
+         Converts the output of [`ConditionalDetrForSegmentation`] into semantic segmentation maps. Only supports
+         PyTorch.
+ 
+         Args:
+             outputs ([`ConditionalDetrForSegmentation`]):
+                 Raw outputs of the model.
+             target_sizes (`list[tuple[int, int]]`, *optional*):
+                 A list of tuples (`tuple[int, int]`) containing the target size (height, width) of each image in the
+                 batch. If unset, predictions will not be resized.
+         Returns:
+             `list[torch.Tensor]`:
+                 A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
+                 corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each
+                 `torch.Tensor` corresponds to a semantic class id.
+         """
+         class_queries_logits = outputs.logits  # [batch_size, num_queries, num_classes]
+         masks_queries_logits = outputs.pred_masks  # [batch_size, num_queries, height, width]
+ 
+         # Conditional DETR does not have a null class, so we use all classes
+         masks_classes = class_queries_logits.softmax(dim=-1)
+         masks_probs = masks_queries_logits.sigmoid()  # [batch_size, num_queries, height, width]
+ 
+         # Semantic segmentation logits of shape (batch_size, num_classes, height, width)
+         segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
+         batch_size = class_queries_logits.shape[0]
+ 
+         # Resize logits and compute semantic segmentation maps
+         if target_sizes is not None:
+             if batch_size != len(target_sizes):
+                 raise ValueError(
+                     "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                 )
+ 
+             semantic_segmentation = []
+             for idx in range(batch_size):
+                 resized_logits = nn.functional.interpolate(
+                     segmentation[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
+                 )
+                 semantic_map = resized_logits[0].argmax(dim=0)
+                 semantic_segmentation.append(semantic_map)
+         else:
+             semantic_segmentation = segmentation.argmax(dim=1)
+             semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
+ 
+         return semantic_segmentation
+ 
+ 
+ class ConditionalDetrDecoderOutput(DetrDecoderOutput):
+     r"""
+     cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+         Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+         sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
+         used to compute the weighted average in the cross-attention heads.
+     intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
+         Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
+         layernorm.
+     reference_points (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, 2 (anchor points))`):
+         Reference points (reference points of each layer of the decoder).
+     """
+ 
+     reference_points: tuple[torch.FloatTensor] | None = None
+ 
+ 
+ class ConditionalDetrModelOutput(DetrModelOutput):
+     r"""
+     last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+         Sequence of hidden-states at the output of the last layer of the decoder of the model.
+     intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
+         Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
+         layernorm.
+     reference_points (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, 2 (anchor points))`):
+         Reference points (reference points of each layer of the decoder).
+     """
+ 
+     reference_points: tuple[torch.FloatTensor] | None = None
+ 
+ 
+ # function to generate sine positional embeddings for 2D coordinates
+ def gen_sine_position_embeddings(pos_tensor, d_model):
+     scale = 2 * math.pi
+     dim = d_model // 2
+     dim_t = torch.arange(dim, dtype=torch.float32, device=pos_tensor.device)
+     dim_t = 10000 ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / dim)
+     x_embed = pos_tensor[:, :, 0] * scale
+     y_embed = pos_tensor[:, :, 1] * scale
+     pos_x = x_embed[:, :, None] / dim_t
+     pos_y = y_embed[:, :, None] / dim_t
+     pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
+     pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)
+     pos = torch.cat((pos_y, pos_x), dim=2)
+     return pos.to(pos_tensor.dtype)
+ 
+ 
+ class ConditionalDetrObjectDetectionOutput(DetrObjectDetectionOutput):
+     pass
+ 
+ 
+ class ConditionalDetrSegmentationOutput(DetrSegmentationOutput):
+     pass
+ 
+ 
+ class ConditionalDetrConvEncoder(DetrConvEncoder):
+     pass
+ 
+ 
+ class ConditionalDetrSinePositionEmbedding(DetrSinePositionEmbedding):
+     pass
+ 
+ 
+ class ConditionalDetrLearnedPositionEmbedding(DetrLearnedPositionEmbedding):
+     pass
+ 
+ 
+ class ConditionalDetrSelfAttention(DetrSelfAttention):
+     pass
+ 
+ 
+ class ConditionalDetrDecoderSelfAttention(nn.Module):
+     """
+     Multi-headed self-attention for Conditional DETR decoder layers.
+ 
+     This attention module handles separate content and position projections, which are then combined
+     before applying standard self-attention. Position embeddings are added to both queries and keys.
+     """
+ 
+     def __init__(
+         self,
+         config: ConditionalDetrConfig,
+         hidden_size: int,
+         num_attention_heads: int,
+         dropout: float = 0.0,
+     ):
+         super().__init__()
+         self.config = config
+         self.hidden_size = hidden_size
+         self.head_dim = hidden_size // num_attention_heads
+         self.scaling = self.head_dim**-0.5
+         self.attention_dropout = dropout
+         self.is_causal = False
+ 
+         # Content and position projections
+         self.q_content_proj = nn.Linear(hidden_size, hidden_size)
+         self.q_pos_proj = nn.Linear(hidden_size, hidden_size)
+         self.k_content_proj = nn.Linear(hidden_size, hidden_size)
+         self.k_pos_proj = nn.Linear(hidden_size, hidden_size)
+         self.v_proj = nn.Linear(hidden_size, hidden_size)
+         self.o_proj = nn.Linear(hidden_size, hidden_size)
+ 
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         query_position_embeddings: torch.Tensor,
+         attention_mask: torch.Tensor | None = None,
+         **kwargs: Unpack[TransformersKwargs],
+     ) -> tuple[torch.Tensor, torch.Tensor]:
+         """
+         Args:
+             hidden_states (`torch.Tensor` of shape `(batch_size, num_queries, hidden_size)`):
+                 Input hidden states from the decoder layer.
+             query_position_embeddings (`torch.Tensor` of shape `(batch_size, num_queries, hidden_size)`):
+                 Position embeddings for queries and keys. Required (unlike standard attention). Processed through
+                 separate position projections (`q_pos_proj`, `k_pos_proj`) and added to the content projections.
+             attention_mask (`torch.Tensor` of shape `(batch_size, 1, num_queries, num_queries)`, *optional*):
+                 Attention mask to avoid attending to padding tokens.
+         """
+         input_shape = hidden_states.shape[:-1]
+         hidden_shape = (*input_shape, -1, self.head_dim)
+ 
+         query_states = (
+             (self.q_content_proj(hidden_states) + self.q_pos_proj(query_position_embeddings))
+             .view(hidden_shape)
+             .transpose(1, 2)
+         )
+         key_states = (
+             (self.k_content_proj(hidden_states) + self.k_pos_proj(query_position_embeddings))
+             .view(hidden_shape)
+             .transpose(1, 2)
+         )
+         value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+ 
+         attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+             self.config._attn_implementation, eager_attention_forward
+         )
+ 
+         attn_output, attn_weights = attention_interface(
+             self,
+             query_states,
+             key_states,
+             value_states,
+             attention_mask,
+             dropout=0.0 if not self.training else self.attention_dropout,
+             scaling=self.scaling,
+             **kwargs,
+         )
+ 
+         attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+         attn_output = self.o_proj(attn_output)
+         return attn_output, attn_weights
+ 
+ 
+ class ConditionalDetrDecoderCrossAttention(nn.Module):
+     """
+     Multi-headed cross-attention for Conditional DETR decoder layers.
+ 
+     This attention module handles the special cross-attention logic in Conditional DETR:
+     - Separate content and position projections for queries and keys
+     - Concatenation of query sine embeddings with queries (doubling the query dimension)
+     - Concatenation of key position embeddings with keys (doubling the key dimension)
+     - Output dimension remains hidden_size despite the doubled input dimensions
+     """
+ 
+     def __init__(
+         self,
+         config: ConditionalDetrConfig,
+         hidden_size: int,
+         num_attention_heads: int,
+         dropout: float = 0.0,
+     ):
+         super().__init__()
+         self.config = config
+         self.hidden_size = hidden_size
+         self.num_attention_heads = num_attention_heads
+         self.head_dim = hidden_size // num_attention_heads
+         self.attention_dropout = dropout
+         self.is_causal = False
+ 
+         # Content and position projections
+         self.q_content_proj = nn.Linear(hidden_size, hidden_size)
+         self.q_pos_proj = nn.Linear(hidden_size, hidden_size)
+         self.k_content_proj = nn.Linear(hidden_size, hidden_size)
+         self.k_pos_proj = nn.Linear(hidden_size, hidden_size)
+         self.v_proj = nn.Linear(hidden_size, hidden_size)
+         self.q_pos_sine_proj = nn.Linear(hidden_size, hidden_size)
+ 
+         # Output projection: only queries and keys are doubled by the concatenation; values
+         # (and therefore the attention output) keep hidden_size
+         self.o_proj = nn.Linear(hidden_size, hidden_size)
+ 
+         # Compute scaling for the expanded head_dim (q and k have doubled dimensions after concatenation)
+         # This matches the original Conditional DETR implementation where embed_dim * 2 is used
+         expanded_head_dim = (hidden_size * 2) // num_attention_heads
+         self.scaling = expanded_head_dim**-0.5
+ 
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         encoder_hidden_states: torch.Tensor,
+         query_sine_embed: torch.Tensor,
+         encoder_position_embeddings: torch.Tensor,
+         query_position_embeddings: torch.Tensor | None = None,
+         attention_mask: torch.Tensor | None = None,
+         **kwargs: Unpack[TransformersKwargs],
+     ) -> tuple[torch.Tensor, torch.Tensor]:
+         """
+         Args:
+             hidden_states (`torch.Tensor` of shape `(batch_size, num_queries, hidden_size)`):
+                 Decoder hidden states (queries).
+             encoder_hidden_states (`torch.Tensor` of shape `(batch_size, encoder_seq_len, hidden_size)`):
+                 Encoder output hidden states (keys and values).
+             query_sine_embed (`torch.Tensor` of shape `(batch_size, num_queries, hidden_size)`):
+                 Sine position embeddings for queries. **Concatenated** (not added) with the query content,
+                 doubling the query dimension.
+             encoder_position_embeddings (`torch.Tensor` of shape `(batch_size, encoder_seq_len, hidden_size)`):
+                 Position embeddings for keys. **Concatenated** (not added) with the key content, doubling the
+                 key dimension.
+             query_position_embeddings (`torch.Tensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+                 Additional position embeddings. When provided (first layer only), **added** to the query content
+                 before concatenation with `query_sine_embed`. Also causes `encoder_position_embeddings` to be
+                 added to the key content before concatenation.
+             attention_mask (`torch.Tensor` of shape `(batch_size, 1, num_queries, encoder_seq_len)`, *optional*):
+                 Attention mask to avoid attending to padding tokens.
+         """
+         query_input_shape = hidden_states.shape[:-1]
+         kv_input_shape = encoder_hidden_states.shape[:-1]
+         query_hidden_shape = (*query_input_shape, self.num_attention_heads, self.head_dim)
+         kv_hidden_shape = (*kv_input_shape, self.num_attention_heads, self.head_dim)
+ 
+         # Apply content and position projections
+         query_input = self.q_content_proj(hidden_states)
+         key_input = self.k_content_proj(encoder_hidden_states)
+         value_states = self.v_proj(encoder_hidden_states)
+         key_pos = self.k_pos_proj(encoder_position_embeddings)
+ 
+         # Combine content and position embeddings
+         if query_position_embeddings is not None:
+             query_input = query_input + self.q_pos_proj(query_position_embeddings)
+             key_input = key_input + key_pos
+ 
+         # Reshape and concatenate position embeddings (doubling head_dim)
+         query_input = query_input.view(query_hidden_shape)
+         key_input = key_input.view(kv_hidden_shape)
+         query_sine_embed = self.q_pos_sine_proj(query_sine_embed).view(query_hidden_shape)
+         key_pos = key_pos.view(kv_hidden_shape)
+ 
+         query_states = torch.cat([query_input, query_sine_embed], dim=-1).view(*query_input_shape, -1)
+         key_states = torch.cat([key_input, key_pos], dim=-1).view(*kv_input_shape, -1)
+ 
+         # Reshape for attention computation
+         expanded_head_dim = query_states.shape[-1] // self.num_attention_heads
+         query_states = query_states.view(*query_input_shape, self.num_attention_heads, expanded_head_dim).transpose(
+             1, 2
+         )
+         key_states = key_states.view(*kv_input_shape, self.num_attention_heads, expanded_head_dim).transpose(1, 2)
+         value_states = value_states.view(kv_hidden_shape).transpose(1, 2)
+ 
+         attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+             self.config._attn_implementation, eager_attention_forward
+         )
+ 
+         attn_output, attn_weights = attention_interface(
+             self,
+             query_states,
+             key_states,
+             value_states,
+             attention_mask,
+             dropout=0.0 if not self.training else self.attention_dropout,
+             scaling=self.scaling,
+             **kwargs,
+         )
+ 
+         attn_output = attn_output.reshape(*query_input_shape, -1).contiguous()
+         attn_output = self.o_proj(attn_output)
+         return attn_output, attn_weights
+ 
+ 
+ class ConditionalDetrMLP(DetrMLP):
+     pass
+ 
+ 
+ class ConditionalDetrEncoderLayer(DetrEncoderLayer):
+     pass
+ 
+ 
+ class ConditionalDetrDecoderLayer(DetrDecoderLayer):
+     def __init__(self, config: ConditionalDetrConfig):
+         super().__init__(config)
+         self.self_attn = ConditionalDetrDecoderSelfAttention(
+             config=config,
+             hidden_size=self.hidden_size,
+             num_attention_heads=config.decoder_attention_heads,
+             dropout=config.attention_dropout,
+         )
+         self.encoder_attn = ConditionalDetrDecoderCrossAttention(
+             config=config,
+             hidden_size=self.hidden_size,
+             num_attention_heads=config.decoder_attention_heads,
+             dropout=config.attention_dropout,
+         )
+ 
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: torch.Tensor | None = None,
+         spatial_position_embeddings: torch.Tensor | None = None,
+         query_position_embeddings: torch.Tensor | None = None,
+         query_sine_embed: torch.Tensor | None = None,
+         encoder_hidden_states: torch.Tensor | None = None,
+         encoder_attention_mask: torch.Tensor | None = None,
+         is_first: bool | None = False,
+         **kwargs: Unpack[TransformersKwargs],
+     ) -> torch.Tensor:
+         """
+         Args:
+             hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+             attention_mask (`torch.FloatTensor`): attention mask of size
+                 `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+                 values.
+             spatial_position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                 Spatial position embeddings (2D positional encodings) that are added to the queries and keys in
+                 each self-attention layer.
+             query_position_embeddings (`torch.FloatTensor`, *optional*):
+                 object queries that are added to the queries and keys in the self-attention layer.
+             encoder_hidden_states (`torch.FloatTensor`):
+                 cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
+             encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                 `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+                 values.
+         """
+         residual = hidden_states
+ 
+         hidden_states, _ = self.self_attn(
+             hidden_states=hidden_states,
+             query_position_embeddings=query_position_embeddings,
+             attention_mask=attention_mask,
+             **kwargs,
+         )
+ 
+         hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+         hidden_states = residual + hidden_states
+         hidden_states = self.self_attn_layer_norm(hidden_states)
+ 
+         if encoder_hidden_states is not None:
+             residual = hidden_states
+ 
+             hidden_states, _ = self.encoder_attn(
+                 hidden_states=hidden_states,
+                 encoder_hidden_states=encoder_hidden_states,
+                 attention_mask=encoder_attention_mask,
+                 query_sine_embed=query_sine_embed,
+                 encoder_position_embeddings=spatial_position_embeddings,
+                 # Only pass query_position_embeddings for the first layer
+                 query_position_embeddings=query_position_embeddings if is_first else None,
+                 **kwargs,
+             )
+ 
+             hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+             hidden_states = residual + hidden_states
+             hidden_states = self.encoder_attn_layer_norm(hidden_states)
+ 
+         # Fully Connected
+         residual = hidden_states
+         hidden_states = self.mlp(hidden_states)
+         hidden_states = residual + hidden_states
+         hidden_states = self.final_layer_norm(hidden_states)
+ 
+         return hidden_states
+ 
+ 
+ class ConditionalDetrMLPPredictionHead(DetrMLPPredictionHead):
+     pass
+ 
+ 
+ class ConditionalDetrPreTrainedModel(DetrPreTrainedModel):
+     _keys_to_ignore_on_load_unexpected = [
+         r"detr\.model\.backbone\.model\.layer\d+\.0\.downsample\.1\.num_batches_tracked"
+     ]
+ 
+ 
+ class ConditionalDetrEncoder(DetrEncoder):
+     pass
+ 
+ 
+ class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel):
+     """
+     Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`ConditionalDetrDecoderLayer`].
+ 
+     The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
+ 
+     Some small tweaks for Conditional DETR:
+ 
+     - object_queries and query_position_embeddings are added to the forward pass.
+     - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers.
+ 
+     Args:
+         config: ConditionalDetrConfig
+     """
+ 
+     _can_record_outputs = {
+         "hidden_states": ConditionalDetrDecoderLayer,
+         "attentions": OutputRecorder(ConditionalDetrDecoderSelfAttention, layer_name="self_attn", index=1),
+         "cross_attentions": OutputRecorder(ConditionalDetrDecoderCrossAttention, layer_name="encoder_attn", index=1),
+     }
+ 
+     def __init__(self, config: ConditionalDetrConfig):
+         super().__init__(config)
+         self.hidden_size = config.d_model
+ 
+         self.dropout = config.dropout
+         self.layerdrop = config.decoder_layerdrop
+ 
+         self.layers = nn.ModuleList([ConditionalDetrDecoderLayer(config) for _ in range(config.decoder_layers)])
+         # in Conditional DETR, the decoder uses layernorm after the last decoder layer output
+         self.layernorm = nn.LayerNorm(config.d_model)
+ 
+         # query_scale is the FFN applied on f to generate transformation T
+         self.query_scale = ConditionalDetrMLPPredictionHead(self.hidden_size, self.hidden_size, self.hidden_size, 2)
+         self.ref_point_head = ConditionalDetrMLPPredictionHead(self.hidden_size, self.hidden_size, 2, 2)
+         for layer_id in range(config.decoder_layers - 1):
+             # Set q_pos_proj to None for layers after the first (only the first layer uses query position embeddings)
+             self.layers[layer_id + 1].encoder_attn.q_pos_proj = None
+ 
+         # Initialize weights and apply final processing
+         self.post_init()
+ 
+     @check_model_inputs()
+     def forward(
+         self,
+         inputs_embeds=None,
+         attention_mask=None,
+         encoder_hidden_states=None,
+         encoder_attention_mask=None,
+         spatial_position_embeddings=None,
+         object_queries_position_embeddings=None,
+         **kwargs: Unpack[TransformersKwargs],
+     ) -> ConditionalDetrDecoderOutput:
+         r"""
+         Args:
+             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                 The query embeddings that are passed into the decoder.
+             attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                 Mask to avoid performing attention on certain queries. Mask values selected in `[0, 1]`:
+ 
+                 - 1 for queries that are **not masked**,
+                 - 0 for queries that are **masked**.
+ 
+                 [What are attention masks?](../glossary#attention-mask)
+             encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                 Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                 of the decoder.
+             encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                 Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+                 in `[0, 1]`:
+ 
+                 - 1 for pixels that are real (i.e. **not masked**),
+                 - 0 for pixels that are padding (i.e. **masked**).
+ 
+             spatial_position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                 Spatial position embeddings that are added to the queries and keys in each cross-attention layer.
+             object_queries_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+                 Position embeddings that are added to the queries and keys in each self-attention layer.
+         """
+         if inputs_embeds is not None:
+             hidden_states = inputs_embeds
+ 
+         # expand encoder attention mask
+         if encoder_hidden_states is not None and encoder_attention_mask is not None:
+             # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+             encoder_attention_mask = create_bidirectional_mask(
+                 self.config,
+                 inputs_embeds,
+                 encoder_attention_mask,
+             )
+ 
+         # optional intermediate hidden states
+         intermediate = () if self.config.auxiliary_loss else None
+ 
+         reference_points_before_sigmoid = self.ref_point_head(
+             object_queries_position_embeddings
+         )  # [num_queries, batch_size, 2]
+         reference_points = reference_points_before_sigmoid.sigmoid().transpose(0, 1)
+         obj_center = reference_points[..., :2].transpose(0, 1)
+         # get sine embedding for the query vector
+         query_sine_embed_before_transformation = gen_sine_position_embeddings(obj_center, self.config.d_model)
+ 
+         for idx, decoder_layer in enumerate(self.layers):
+             if self.training:
+                 dropout_probability = torch.rand([])
+                 if dropout_probability < self.layerdrop:
+                     continue
+             if idx == 0:
+                 pos_transformation = 1
+             else:
+                 pos_transformation = self.query_scale(hidden_states)
+             # apply transformation
+             query_sine_embed = query_sine_embed_before_transformation * pos_transformation
+ 
+             hidden_states = decoder_layer(
+                 hidden_states,
+                 None,
+                 spatial_position_embeddings,
+                 object_queries_position_embeddings,
+                 query_sine_embed,
+                 encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                 encoder_attention_mask=encoder_attention_mask,
+                 is_first=(idx == 0),
+                 **kwargs,
+             )
+ 
+             if self.config.auxiliary_loss:
+                 hidden_states = self.layernorm(hidden_states)
+                 intermediate += (hidden_states,)
+ 
+         # finally, apply layernorm
+         hidden_states = self.layernorm(hidden_states)
+ 
+         # stack intermediate decoder activations
+         if self.config.auxiliary_loss:
+             intermediate = torch.stack(intermediate)
+ 
+         return ConditionalDetrDecoderOutput(
+             last_hidden_state=hidden_states,
+             intermediate_hidden_states=intermediate,
+             reference_points=reference_points,
+         )
+ 
+ 
+ class ConditionalDetrModel(DetrModel):
+     def __init__(self, config: ConditionalDetrConfig):
+         super().__init__(config)
+         self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model)
+ 
+         # Initialize weights and apply final processing
+         self.post_init()
+ 
+     @auto_docstring
+     @can_return_tuple
+     def forward(
+         self,
+         pixel_values: torch.FloatTensor,
+         pixel_mask: torch.LongTensor | None = None,
+         decoder_attention_mask: torch.LongTensor | None = None,
+         encoder_outputs: torch.FloatTensor | None = None,
+         inputs_embeds: torch.FloatTensor | None = None,
+         decoder_inputs_embeds: torch.FloatTensor | None = None,
+         **kwargs: Unpack[TransformersKwargs],
+     ) -> ConditionalDetrModelOutput:
+         r"""
+         decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
+             Not used by default. Can be used to mask object queries.
+         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+             can choose to directly pass a flattened representation of an image.
+         decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+             Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+             embedded representation.
+ 
+         Examples:
+ 
+         ```python
+         >>> from transformers import AutoImageProcessor, AutoModel
+         >>> from PIL import Image
+         >>> import requests
+ 
+         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+         >>> image = Image.open(requests.get(url, stream=True).raw)
+ 
+         >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/conditional-detr-resnet-50")
+         >>> model = AutoModel.from_pretrained("microsoft/conditional-detr-resnet-50")
+ 
+         >>> # prepare image for the model
+         >>> inputs = image_processor(images=image, return_tensors="pt")
+ 
+         >>> # forward pass
+         >>> outputs = model(**inputs)
+ 
+         >>> # the last hidden states are the final query embeddings of the Transformer decoder
+         >>> # these are of shape (batch_size, num_queries, hidden_size)
+         >>> last_hidden_states = outputs.last_hidden_state
+         >>> list(last_hidden_states.shape)
+         [1, 300, 256]
+         ```"""
+         batch_size, num_channels, height, width = pixel_values.shape
+         device = pixel_values.device
+ 
+         if pixel_mask is None:
+             pixel_mask = torch.ones((batch_size, height, width), device=device)
+ 
+         # First, send pixel_values + pixel_mask through the backbone to obtain the features
+         # pixel_values should be of shape (batch_size, num_channels, height, width)
+         # pixel_mask should be of shape (batch_size, height, width)
+         features = self.backbone(pixel_values, pixel_mask)
+ 
+         # get final feature map and downsampled mask
+         feature_map, mask = features[-1]
+ 
+         if mask is None:
+             raise ValueError("Backbone does not return downsampled pixel mask")
+ 
+         # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
+         projected_feature_map = self.input_projection(feature_map)
+ 
+         # Generate position embeddings
+         spatial_position_embeddings = self.position_embedding(
+             shape=feature_map.shape, device=device, dtype=pixel_values.dtype, mask=mask
+         )
+ 
+         # Third, flatten the feature map of shape NxCxHxW to NxCxHW, and permute it to NxHWxC
+         # In other words, turn their shape into (batch_size, sequence_length, hidden_size)
+         flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1)
+ 
+         flattened_mask = mask.flatten(1)
+ 
+         # Fourth, send flattened_features + flattened_mask + spatial_position_embeddings through the encoder
+         # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size)
+         # flattened_mask is a Tensor of shape (batch_size, height*width)
+         if encoder_outputs is None:
+             encoder_outputs = self.encoder(
+                 inputs_embeds=flattened_features,
+                 attention_mask=flattened_mask,
+                 spatial_position_embeddings=spatial_position_embeddings,
+                 **kwargs,
+             )
+         # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput
+         elif not isinstance(encoder_outputs, BaseModelOutput):
+             encoder_outputs = BaseModelOutput(
+                 last_hidden_state=encoder_outputs[0],
+                 hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                 attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+             )
+ 
+         # Fifth, send query embeddings through the decoder (which is conditioned on the encoder output)
+         object_queries_position_embeddings = self.query_position_embeddings.weight.unsqueeze(0).repeat(
+             batch_size, 1, 1
+         )
+         queries = torch.zeros_like(object_queries_position_embeddings)
+ 
+         # decoder outputs consist of (dec_features, dec_hidden, dec_attn)
+         decoder_outputs = self.decoder(
+             inputs_embeds=queries,
+             attention_mask=None,
+             spatial_position_embeddings=spatial_position_embeddings,
+             object_queries_position_embeddings=object_queries_position_embeddings,
+             encoder_hidden_states=encoder_outputs.last_hidden_state,
+             encoder_attention_mask=flattened_mask,
+             **kwargs,
+         )
+ 
+         return ConditionalDetrModelOutput(
+             last_hidden_state=decoder_outputs.last_hidden_state,
+             decoder_hidden_states=decoder_outputs.hidden_states,
+             decoder_attentions=decoder_outputs.attentions,
+             cross_attentions=decoder_outputs.cross_attentions,
+             encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+             encoder_hidden_states=encoder_outputs.hidden_states,
+             encoder_attentions=encoder_outputs.attentions,
+             intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+             reference_points=decoder_outputs.reference_points,
+         )
+ 
+ 
+ class ConditionalDetrForObjectDetection(DetrForObjectDetection):
+     def __init__(self, config: ConditionalDetrConfig):
+         super().__init__(config)
+         self.class_labels_classifier = nn.Linear(config.d_model, config.num_labels)
+ 
+     # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py
+     def _set_aux_loss(self, outputs_class, outputs_coord):
+         return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
+ 
+     @auto_docstring
+     @can_return_tuple
+     def forward(
+         self,
+         pixel_values: torch.FloatTensor,
+         pixel_mask: torch.LongTensor | None = None,
+         decoder_attention_mask: torch.LongTensor | None = None,
+         encoder_outputs: torch.FloatTensor | None = None,
+         inputs_embeds: torch.FloatTensor | None = None,
+         decoder_inputs_embeds: torch.FloatTensor | None = None,
+         labels: list[dict] | None = None,
+         **kwargs: Unpack[TransformersKwargs],
+     ) -> ConditionalDetrObjectDetectionOutput:
+         r"""
+         decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
+             Not used by default. Can be used to mask object queries.
+         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+             can choose to directly pass a flattened representation of an image.
+         decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+             Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+             embedded representation.
+         labels (`list[dict]` of len `(batch_size,)`, *optional*):
+             Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+             following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+             respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+             in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+ 
+         Examples:
+ 
+         ```python
+         >>> from transformers import AutoImageProcessor, AutoModelForObjectDetection
+         >>> from PIL import Image
+         >>> import requests
+         >>> import torch
+ 
+         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+         >>> image = Image.open(requests.get(url, stream=True).raw)
+ 
+         >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/conditional-detr-resnet-50")
+         >>> model = AutoModelForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50")
+ 
+         >>> inputs = image_processor(images=image, return_tensors="pt")
+ 
+         >>> outputs = model(**inputs)
+ 
+         >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+         >>> target_sizes = torch.tensor([image.size[::-1]])
+         >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[
+         ...     0
+         ... ]
+         >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+         ...     box = [round(i, 2) for i in box.tolist()]
+         ...     print(
+         ...         f"Detected {model.config.id2label[label.item()]} with confidence "
+         ...         f"{round(score.item(), 3)} at location {box}"
+         ...     )
+         Detected remote with confidence 0.833 at location [38.31, 72.1, 177.63, 118.45]
+         Detected cat with confidence 0.831 at location [9.2, 51.38, 321.13, 469.0]
+         Detected cat with confidence 0.804 at location [340.3, 16.85, 642.93, 370.95]
+         Detected remote with confidence 0.683 at location [334.48, 73.49, 366.37, 190.01]
+         Detected couch with confidence 0.535 at location [0.52, 1.19, 640.35, 475.1]
+         ```"""
+         # First, send images through the CONDITIONAL_DETR base model to obtain encoder + decoder outputs
+         outputs = self.model(
+             pixel_values,
+             pixel_mask=pixel_mask,
+             decoder_attention_mask=decoder_attention_mask,
+             encoder_outputs=encoder_outputs,
+             inputs_embeds=inputs_embeds,
+             decoder_inputs_embeds=decoder_inputs_embeds,
+             **kwargs,
+         )
+ 
+         sequence_output = outputs[0]
+ 
+         # class logits + predicted bounding boxes
+         logits = self.class_labels_classifier(sequence_output)
+ 
+         reference = outputs.reference_points
+         reference_before_sigmoid = inverse_sigmoid(reference).transpose(0, 1)
+ 
+         hs = sequence_output
+         tmp = self.bbox_predictor(hs)
+         tmp[..., :2] += reference_before_sigmoid
+         pred_boxes = tmp.sigmoid()
+ 
+         loss, loss_dict, auxiliary_outputs = None, None, None
+         if labels is not None:
+             outputs_class, outputs_coord = None, None
+             if self.config.auxiliary_loss:
+                 outputs_coords = []
+                 intermediate = outputs.intermediate_hidden_states
+                 outputs_class = self.class_labels_classifier(intermediate)
+                 for lvl in range(intermediate.shape[0]):
+                     tmp = self.bbox_predictor(intermediate[lvl])
+                     tmp[..., :2] += reference_before_sigmoid
+                     outputs_coord = tmp.sigmoid()
+                     outputs_coords.append(outputs_coord)
+                 outputs_coord = torch.stack(outputs_coords)
+             loss, loss_dict, auxiliary_outputs = self.loss_function(
+                 logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord
+             )
+ 
+         return ConditionalDetrObjectDetectionOutput(
+             loss=loss,
+             loss_dict=loss_dict,
+             logits=logits,
+             pred_boxes=pred_boxes,
+             auxiliary_outputs=auxiliary_outputs,
+             last_hidden_state=outputs.last_hidden_state,
+             decoder_hidden_states=outputs.decoder_hidden_states,
+             decoder_attentions=outputs.decoder_attentions,
+             cross_attentions=outputs.cross_attentions,
+             encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+             encoder_hidden_states=outputs.encoder_hidden_states,
+             encoder_attentions=outputs.encoder_attentions,
+         )
+ 
+ 
+ class ConditionalDetrForSegmentation(DetrForSegmentation):
+     pass
+ 
 
- __all__ = ["ConditionalDetrImageProcessorFast"]
+ __all__ = [
+     "ConditionalDetrImageProcessorFast",
+     "ConditionalDetrForObjectDetection",
+     "ConditionalDetrForSegmentation",
+     "ConditionalDetrModel",
+     "ConditionalDetrPreTrainedModel",
+ ]
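
The `gen_sine_position_embeddings` helper introduced in this diff maps normalized (x, y) reference points to `d_model // 2` sine/cosine features per coordinate, concatenated along the last dimension. A minimal sketch of that shape contract, assuming the function above has been copied into scope (the sizes below are illustrative, matching the default `d_model=256` and 300 queries):

```python
import torch

# Assumes gen_sine_position_embeddings from the diff above is in scope.
batch_size, num_queries, d_model = 2, 300, 256

# Normalized (x, y) box centers in [0, 1], as produced by ref_point_head + sigmoid
pos_tensor = torch.rand(batch_size, num_queries, 2)

pos = gen_sine_position_embeddings(pos_tensor, d_model)
# d_model // 2 features per coordinate; the y-embedding is concatenated before the x-embedding
assert pos.shape == (batch_size, num_queries, d_model)
```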
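The doubled query/key width in `ConditionalDetrDecoderCrossAttention` is easy to misread, since only queries and keys grow while values keep the regular width. A short sketch of the head-dim arithmetic, using the default `d_model=256` with 8 heads (illustrative numbers, not read from any config):

```python
hidden_size, num_heads = 256, 8

# Values (and therefore the attention output) keep the regular head width
head_dim = hidden_size // num_heads                   # 32

# Queries and keys concatenate content features with position features per head,
# so the softmax scaling uses the expanded width, not the value width
expanded_head_dim = (hidden_size * 2) // num_heads    # 64
scaling = expanded_head_dim ** -0.5

assert (head_dim, expanded_head_dim) == (32, 64)
```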
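Finally, a hedged usage sketch for the `post_process_semantic_segmentation` method added to `ConditionalDetrImageProcessorFast`. The checkpoint path is a placeholder: this assumes some `ConditionalDetrForSegmentation` fine-tune, not a specific published model.

```python
import requests
import torch
from PIL import Image

from transformers import AutoImageProcessor, ConditionalDetrForSegmentation

checkpoint = "path/to/your-conditional-detr-segmentation-checkpoint"  # placeholder
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = ConditionalDetrForSegmentation.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# One (height, width) map of semantic class ids per image, resized to the original size
maps = image_processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])
print(maps[0].shape)
```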