transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
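The listing below was produced by the registry's diff tooling. As a rough way to reproduce a file-level comparison locally, the sketch below (an illustration, not the tool used here) downloads both wheels with pip and compares their member lists; it flags a file as changed only when its uncompressed size differs, so it approximates rather than reproduces the per-file line counts shown in the listing.

```python
import pathlib
import subprocess
import tempfile
import zipfile


def wheel_members(path):
    """Map each file inside a wheel to its uncompressed size in bytes."""
    with zipfile.ZipFile(path) as zf:
        return {info.filename: info.file_size for info in zf.infolist()}


with tempfile.TemporaryDirectory() as tmp:
    # Fetch both wheels without dependencies; versions are taken from the header above.
    for spec in ("transformers==5.0.0rc3", "transformers==5.1.0"):
        subprocess.run(
            ["pip", "download", "--no-deps", "--only-binary=:all:", "--dest", tmp, spec],
            check=True,
        )
    # Lexicographic sort puts the 5.0.0rc3 wheel before the 5.1.0 wheel.
    old_whl, new_whl = sorted(pathlib.Path(tmp).glob("transformers-*.whl"))
    old, new = wheel_members(old_whl), wheel_members(new_whl)
    for name in sorted(set(old) | set(new)):
        if name not in old:
            print(f"added:   {name}")
        elif name not in new:
            print(f"removed: {name}")
        elif old[name] != new[name]:
            print(f"changed: {name}")
```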
Files changed (1021)
  1. transformers/__init__.py +4 -11
  2. transformers/activations.py +2 -2
  3. transformers/backbone_utils.py +326 -0
  4. transformers/cache_utils.py +11 -2
  5. transformers/cli/serve.py +11 -8
  6. transformers/configuration_utils.py +1 -69
  7. transformers/conversion_mapping.py +146 -26
  8. transformers/convert_slow_tokenizer.py +6 -4
  9. transformers/core_model_loading.py +207 -118
  10. transformers/dependency_versions_check.py +0 -1
  11. transformers/dependency_versions_table.py +7 -8
  12. transformers/file_utils.py +0 -2
  13. transformers/generation/candidate_generator.py +1 -2
  14. transformers/generation/continuous_batching/cache.py +40 -38
  15. transformers/generation/continuous_batching/cache_manager.py +3 -16
  16. transformers/generation/continuous_batching/continuous_api.py +94 -406
  17. transformers/generation/continuous_batching/input_ouputs.py +464 -0
  18. transformers/generation/continuous_batching/requests.py +54 -17
  19. transformers/generation/continuous_batching/scheduler.py +77 -95
  20. transformers/generation/logits_process.py +10 -5
  21. transformers/generation/stopping_criteria.py +1 -2
  22. transformers/generation/utils.py +75 -95
  23. transformers/image_processing_utils.py +0 -3
  24. transformers/image_processing_utils_fast.py +17 -18
  25. transformers/image_transforms.py +44 -13
  26. transformers/image_utils.py +0 -5
  27. transformers/initialization.py +57 -0
  28. transformers/integrations/__init__.py +10 -24
  29. transformers/integrations/accelerate.py +47 -11
  30. transformers/integrations/deepspeed.py +145 -3
  31. transformers/integrations/executorch.py +2 -6
  32. transformers/integrations/finegrained_fp8.py +142 -7
  33. transformers/integrations/flash_attention.py +2 -7
  34. transformers/integrations/hub_kernels.py +18 -7
  35. transformers/integrations/moe.py +226 -106
  36. transformers/integrations/mxfp4.py +47 -34
  37. transformers/integrations/peft.py +488 -176
  38. transformers/integrations/tensor_parallel.py +641 -581
  39. transformers/masking_utils.py +153 -9
  40. transformers/modeling_flash_attention_utils.py +1 -2
  41. transformers/modeling_utils.py +359 -358
  42. transformers/models/__init__.py +6 -0
  43. transformers/models/afmoe/configuration_afmoe.py +14 -4
  44. transformers/models/afmoe/modeling_afmoe.py +8 -8
  45. transformers/models/afmoe/modular_afmoe.py +7 -7
  46. transformers/models/aimv2/configuration_aimv2.py +2 -7
  47. transformers/models/aimv2/modeling_aimv2.py +26 -24
  48. transformers/models/aimv2/modular_aimv2.py +8 -12
  49. transformers/models/albert/configuration_albert.py +8 -1
  50. transformers/models/albert/modeling_albert.py +3 -3
  51. transformers/models/align/configuration_align.py +8 -5
  52. transformers/models/align/modeling_align.py +22 -24
  53. transformers/models/altclip/configuration_altclip.py +4 -6
  54. transformers/models/altclip/modeling_altclip.py +30 -26
  55. transformers/models/apertus/configuration_apertus.py +5 -7
  56. transformers/models/apertus/modeling_apertus.py +4 -4
  57. transformers/models/apertus/modular_apertus.py +8 -10
  58. transformers/models/arcee/configuration_arcee.py +5 -7
  59. transformers/models/arcee/modeling_arcee.py +4 -4
  60. transformers/models/aria/configuration_aria.py +11 -21
  61. transformers/models/aria/modeling_aria.py +39 -36
  62. transformers/models/aria/modular_aria.py +33 -39
  63. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
  64. transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
  65. transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
  66. transformers/models/auto/auto_factory.py +8 -6
  67. transformers/models/auto/configuration_auto.py +22 -0
  68. transformers/models/auto/image_processing_auto.py +17 -13
  69. transformers/models/auto/modeling_auto.py +15 -0
  70. transformers/models/auto/processing_auto.py +9 -18
  71. transformers/models/auto/tokenization_auto.py +17 -15
  72. transformers/models/autoformer/modeling_autoformer.py +2 -1
  73. transformers/models/aya_vision/configuration_aya_vision.py +4 -0
  74. transformers/models/aya_vision/modeling_aya_vision.py +29 -62
  75. transformers/models/aya_vision/modular_aya_vision.py +20 -45
  76. transformers/models/bamba/configuration_bamba.py +17 -7
  77. transformers/models/bamba/modeling_bamba.py +23 -55
  78. transformers/models/bamba/modular_bamba.py +19 -54
  79. transformers/models/bark/configuration_bark.py +2 -1
  80. transformers/models/bark/modeling_bark.py +24 -10
  81. transformers/models/bart/configuration_bart.py +9 -4
  82. transformers/models/bart/modeling_bart.py +9 -12
  83. transformers/models/beit/configuration_beit.py +2 -4
  84. transformers/models/beit/image_processing_beit_fast.py +3 -3
  85. transformers/models/beit/modeling_beit.py +14 -9
  86. transformers/models/bert/configuration_bert.py +12 -1
  87. transformers/models/bert/modeling_bert.py +6 -30
  88. transformers/models/bert_generation/configuration_bert_generation.py +17 -1
  89. transformers/models/bert_generation/modeling_bert_generation.py +6 -6
  90. transformers/models/big_bird/configuration_big_bird.py +12 -8
  91. transformers/models/big_bird/modeling_big_bird.py +0 -15
  92. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
  93. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
  94. transformers/models/biogpt/configuration_biogpt.py +8 -1
  95. transformers/models/biogpt/modeling_biogpt.py +4 -8
  96. transformers/models/biogpt/modular_biogpt.py +1 -5
  97. transformers/models/bit/configuration_bit.py +2 -4
  98. transformers/models/bit/modeling_bit.py +6 -5
  99. transformers/models/bitnet/configuration_bitnet.py +5 -7
  100. transformers/models/bitnet/modeling_bitnet.py +3 -4
  101. transformers/models/bitnet/modular_bitnet.py +3 -4
  102. transformers/models/blenderbot/configuration_blenderbot.py +8 -4
  103. transformers/models/blenderbot/modeling_blenderbot.py +4 -4
  104. transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
  105. transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
  106. transformers/models/blip/configuration_blip.py +9 -9
  107. transformers/models/blip/modeling_blip.py +55 -37
  108. transformers/models/blip_2/configuration_blip_2.py +2 -1
  109. transformers/models/blip_2/modeling_blip_2.py +81 -56
  110. transformers/models/bloom/configuration_bloom.py +5 -1
  111. transformers/models/bloom/modeling_bloom.py +2 -1
  112. transformers/models/blt/configuration_blt.py +23 -12
  113. transformers/models/blt/modeling_blt.py +20 -14
  114. transformers/models/blt/modular_blt.py +70 -10
  115. transformers/models/bridgetower/configuration_bridgetower.py +7 -1
  116. transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
  117. transformers/models/bridgetower/modeling_bridgetower.py +29 -15
  118. transformers/models/bros/configuration_bros.py +24 -17
  119. transformers/models/camembert/configuration_camembert.py +8 -1
  120. transformers/models/camembert/modeling_camembert.py +6 -6
  121. transformers/models/canine/configuration_canine.py +4 -1
  122. transformers/models/chameleon/configuration_chameleon.py +5 -7
  123. transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
  124. transformers/models/chameleon/modeling_chameleon.py +82 -36
  125. transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
  126. transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
  127. transformers/models/clap/configuration_clap.py +4 -8
  128. transformers/models/clap/modeling_clap.py +21 -22
  129. transformers/models/clip/configuration_clip.py +4 -1
  130. transformers/models/clip/image_processing_clip_fast.py +9 -0
  131. transformers/models/clip/modeling_clip.py +25 -22
  132. transformers/models/clipseg/configuration_clipseg.py +4 -1
  133. transformers/models/clipseg/modeling_clipseg.py +27 -25
  134. transformers/models/clipseg/processing_clipseg.py +11 -3
  135. transformers/models/clvp/configuration_clvp.py +14 -2
  136. transformers/models/clvp/modeling_clvp.py +19 -30
  137. transformers/models/codegen/configuration_codegen.py +4 -3
  138. transformers/models/codegen/modeling_codegen.py +2 -1
  139. transformers/models/cohere/configuration_cohere.py +5 -7
  140. transformers/models/cohere/modeling_cohere.py +4 -4
  141. transformers/models/cohere/modular_cohere.py +3 -3
  142. transformers/models/cohere2/configuration_cohere2.py +6 -8
  143. transformers/models/cohere2/modeling_cohere2.py +4 -4
  144. transformers/models/cohere2/modular_cohere2.py +9 -11
  145. transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
  146. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
  147. transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
  148. transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
  149. transformers/models/colqwen2/modeling_colqwen2.py +7 -6
  150. transformers/models/colqwen2/modular_colqwen2.py +7 -6
  151. transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
  152. transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
  153. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
  154. transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
  155. transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
  156. transformers/models/convbert/configuration_convbert.py +11 -7
  157. transformers/models/convnext/configuration_convnext.py +2 -4
  158. transformers/models/convnext/image_processing_convnext_fast.py +2 -2
  159. transformers/models/convnext/modeling_convnext.py +7 -6
  160. transformers/models/convnextv2/configuration_convnextv2.py +2 -4
  161. transformers/models/convnextv2/modeling_convnextv2.py +7 -6
  162. transformers/models/cpmant/configuration_cpmant.py +4 -0
  163. transformers/models/csm/configuration_csm.py +9 -15
  164. transformers/models/csm/modeling_csm.py +3 -3
  165. transformers/models/ctrl/configuration_ctrl.py +16 -0
  166. transformers/models/ctrl/modeling_ctrl.py +13 -25
  167. transformers/models/cwm/configuration_cwm.py +5 -7
  168. transformers/models/cwm/modeling_cwm.py +4 -4
  169. transformers/models/d_fine/configuration_d_fine.py +10 -56
  170. transformers/models/d_fine/modeling_d_fine.py +728 -868
  171. transformers/models/d_fine/modular_d_fine.py +335 -412
  172. transformers/models/dab_detr/configuration_dab_detr.py +22 -48
  173. transformers/models/dab_detr/modeling_dab_detr.py +11 -7
  174. transformers/models/dac/modeling_dac.py +1 -1
  175. transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
  176. transformers/models/data2vec/configuration_data2vec_text.py +11 -2
  177. transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
  178. transformers/models/data2vec/modeling_data2vec_text.py +6 -6
  179. transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
  180. transformers/models/dbrx/configuration_dbrx.py +11 -3
  181. transformers/models/dbrx/modeling_dbrx.py +6 -6
  182. transformers/models/dbrx/modular_dbrx.py +6 -6
  183. transformers/models/deberta/configuration_deberta.py +6 -0
  184. transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
  185. transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
  186. transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
  187. transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
  188. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
  189. transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
  190. transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
  191. transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
  192. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
  193. transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
  194. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
  195. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
  196. transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
  197. transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
  198. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
  199. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
  200. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
  201. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
  202. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
  203. transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
  204. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
  205. transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
  206. transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
  207. transformers/models/deit/modeling_deit.py +11 -7
  208. transformers/models/depth_anything/configuration_depth_anything.py +12 -42
  209. transformers/models/depth_anything/modeling_depth_anything.py +5 -3
  210. transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
  211. transformers/models/depth_pro/modeling_depth_pro.py +8 -4
  212. transformers/models/detr/configuration_detr.py +18 -49
  213. transformers/models/detr/image_processing_detr_fast.py +11 -11
  214. transformers/models/detr/modeling_detr.py +695 -734
  215. transformers/models/dia/configuration_dia.py +4 -7
  216. transformers/models/dia/generation_dia.py +8 -17
  217. transformers/models/dia/modeling_dia.py +7 -7
  218. transformers/models/dia/modular_dia.py +4 -4
  219. transformers/models/diffllama/configuration_diffllama.py +5 -7
  220. transformers/models/diffllama/modeling_diffllama.py +3 -8
  221. transformers/models/diffllama/modular_diffllama.py +2 -7
  222. transformers/models/dinat/configuration_dinat.py +2 -4
  223. transformers/models/dinat/modeling_dinat.py +7 -6
  224. transformers/models/dinov2/configuration_dinov2.py +2 -4
  225. transformers/models/dinov2/modeling_dinov2.py +9 -8
  226. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
  227. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
  228. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
  229. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
  230. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
  231. transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
  232. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
  233. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
  234. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
  235. transformers/models/distilbert/configuration_distilbert.py +8 -1
  236. transformers/models/distilbert/modeling_distilbert.py +3 -3
  237. transformers/models/doge/configuration_doge.py +17 -7
  238. transformers/models/doge/modeling_doge.py +4 -4
  239. transformers/models/doge/modular_doge.py +20 -10
  240. transformers/models/donut/image_processing_donut_fast.py +4 -4
  241. transformers/models/dots1/configuration_dots1.py +16 -7
  242. transformers/models/dots1/modeling_dots1.py +4 -4
  243. transformers/models/dpr/configuration_dpr.py +19 -1
  244. transformers/models/dpt/configuration_dpt.py +23 -65
  245. transformers/models/dpt/image_processing_dpt_fast.py +5 -5
  246. transformers/models/dpt/modeling_dpt.py +19 -15
  247. transformers/models/dpt/modular_dpt.py +4 -4
  248. transformers/models/edgetam/configuration_edgetam.py +1 -1
  249. transformers/models/edgetam/modeling_edgetam.py +53 -53
  250. transformers/models/edgetam/modular_edgetam.py +5 -7
  251. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
  252. transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
  253. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
  254. transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
  255. transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
  256. transformers/models/electra/configuration_electra.py +13 -2
  257. transformers/models/electra/modeling_electra.py +6 -6
  258. transformers/models/emu3/configuration_emu3.py +12 -10
  259. transformers/models/emu3/modeling_emu3.py +84 -47
  260. transformers/models/emu3/modular_emu3.py +77 -39
  261. transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
  262. transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
  263. transformers/models/eomt/configuration_eomt.py +12 -13
  264. transformers/models/eomt/image_processing_eomt_fast.py +3 -3
  265. transformers/models/eomt/modeling_eomt.py +3 -3
  266. transformers/models/eomt/modular_eomt.py +17 -17
  267. transformers/models/eomt_dinov3/__init__.py +28 -0
  268. transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
  269. transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
  270. transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
  271. transformers/models/ernie/configuration_ernie.py +24 -2
  272. transformers/models/ernie/modeling_ernie.py +6 -30
  273. transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
  274. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  275. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
  276. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
  277. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
  278. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
  279. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
  280. transformers/models/esm/configuration_esm.py +9 -11
  281. transformers/models/esm/modeling_esm.py +3 -3
  282. transformers/models/esm/modeling_esmfold.py +1 -6
  283. transformers/models/esm/openfold_utils/protein.py +2 -3
  284. transformers/models/evolla/configuration_evolla.py +21 -8
  285. transformers/models/evolla/modeling_evolla.py +11 -7
  286. transformers/models/evolla/modular_evolla.py +5 -1
  287. transformers/models/exaone4/configuration_exaone4.py +8 -5
  288. transformers/models/exaone4/modeling_exaone4.py +4 -4
  289. transformers/models/exaone4/modular_exaone4.py +11 -8
  290. transformers/models/exaone_moe/__init__.py +27 -0
  291. transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
  292. transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
  293. transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
  294. transformers/models/falcon/configuration_falcon.py +9 -1
  295. transformers/models/falcon/modeling_falcon.py +3 -8
  296. transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
  297. transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
  298. transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
  299. transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
  300. transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
  301. transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
  302. transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
  303. transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
  304. transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
  305. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
  306. transformers/models/flaubert/configuration_flaubert.py +10 -4
  307. transformers/models/flaubert/modeling_flaubert.py +1 -1
  308. transformers/models/flava/configuration_flava.py +4 -3
  309. transformers/models/flava/image_processing_flava_fast.py +4 -4
  310. transformers/models/flava/modeling_flava.py +36 -28
  311. transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
  312. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
  313. transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
  314. transformers/models/florence2/configuration_florence2.py +4 -0
  315. transformers/models/florence2/modeling_florence2.py +57 -32
  316. transformers/models/florence2/modular_florence2.py +48 -26
  317. transformers/models/fnet/configuration_fnet.py +6 -1
  318. transformers/models/focalnet/configuration_focalnet.py +2 -4
  319. transformers/models/focalnet/modeling_focalnet.py +10 -7
  320. transformers/models/fsmt/configuration_fsmt.py +12 -16
  321. transformers/models/funnel/configuration_funnel.py +8 -0
  322. transformers/models/fuyu/configuration_fuyu.py +5 -8
  323. transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
  324. transformers/models/fuyu/modeling_fuyu.py +24 -23
  325. transformers/models/gemma/configuration_gemma.py +5 -7
  326. transformers/models/gemma/modeling_gemma.py +4 -4
  327. transformers/models/gemma/modular_gemma.py +5 -7
  328. transformers/models/gemma2/configuration_gemma2.py +5 -7
  329. transformers/models/gemma2/modeling_gemma2.py +4 -4
  330. transformers/models/gemma2/modular_gemma2.py +8 -10
  331. transformers/models/gemma3/configuration_gemma3.py +28 -22
  332. transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
  333. transformers/models/gemma3/modeling_gemma3.py +37 -33
  334. transformers/models/gemma3/modular_gemma3.py +46 -42
  335. transformers/models/gemma3n/configuration_gemma3n.py +35 -22
  336. transformers/models/gemma3n/modeling_gemma3n.py +86 -58
  337. transformers/models/gemma3n/modular_gemma3n.py +112 -75
  338. transformers/models/git/configuration_git.py +5 -7
  339. transformers/models/git/modeling_git.py +31 -41
  340. transformers/models/glm/configuration_glm.py +7 -9
  341. transformers/models/glm/modeling_glm.py +4 -4
  342. transformers/models/glm4/configuration_glm4.py +7 -9
  343. transformers/models/glm4/modeling_glm4.py +4 -4
  344. transformers/models/glm46v/configuration_glm46v.py +4 -0
  345. transformers/models/glm46v/image_processing_glm46v.py +5 -2
  346. transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
  347. transformers/models/glm46v/modeling_glm46v.py +91 -46
  348. transformers/models/glm46v/modular_glm46v.py +4 -0
  349. transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
  350. transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
  351. transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
  352. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
  353. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
  354. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
  355. transformers/models/glm4v/configuration_glm4v.py +12 -8
  356. transformers/models/glm4v/image_processing_glm4v.py +5 -2
  357. transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
  358. transformers/models/glm4v/modeling_glm4v.py +120 -63
  359. transformers/models/glm4v/modular_glm4v.py +82 -50
  360. transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
  361. transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
  362. transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
  363. transformers/models/glm_image/configuration_glm_image.py +26 -20
  364. transformers/models/glm_image/image_processing_glm_image.py +1 -1
  365. transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
  366. transformers/models/glm_image/modeling_glm_image.py +337 -236
  367. transformers/models/glm_image/modular_glm_image.py +415 -255
  368. transformers/models/glm_image/processing_glm_image.py +65 -17
  369. transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
  370. transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
  371. transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
  372. transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
  373. transformers/models/glmasr/modeling_glmasr.py +34 -28
  374. transformers/models/glmasr/modular_glmasr.py +23 -11
  375. transformers/models/glpn/image_processing_glpn_fast.py +3 -3
  376. transformers/models/glpn/modeling_glpn.py +4 -2
  377. transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
  378. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
  379. transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
  380. transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
  381. transformers/models/gpt2/configuration_gpt2.py +13 -1
  382. transformers/models/gpt2/modeling_gpt2.py +5 -5
  383. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
  384. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
  385. transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
  386. transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
  387. transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
  388. transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
  389. transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
  390. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
  391. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
  392. transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
  393. transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
  394. transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
  395. transformers/models/gptj/configuration_gptj.py +4 -4
  396. transformers/models/gptj/modeling_gptj.py +3 -7
  397. transformers/models/granite/configuration_granite.py +5 -7
  398. transformers/models/granite/modeling_granite.py +4 -4
  399. transformers/models/granite_speech/modeling_granite_speech.py +63 -37
  400. transformers/models/granitemoe/configuration_granitemoe.py +5 -7
  401. transformers/models/granitemoe/modeling_granitemoe.py +4 -4
  402. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
  403. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
  404. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
  405. transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
  406. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
  407. transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
  408. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
  409. transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
  410. transformers/models/groupvit/configuration_groupvit.py +4 -1
  411. transformers/models/groupvit/modeling_groupvit.py +29 -22
  412. transformers/models/helium/configuration_helium.py +5 -7
  413. transformers/models/helium/modeling_helium.py +4 -4
  414. transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
  415. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
  416. transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
  417. transformers/models/hiera/configuration_hiera.py +2 -4
  418. transformers/models/hiera/modeling_hiera.py +11 -8
  419. transformers/models/hubert/configuration_hubert.py +4 -1
  420. transformers/models/hubert/modeling_hubert.py +7 -4
  421. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
  422. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
  423. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
  424. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
  425. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
  426. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
  427. transformers/models/ibert/configuration_ibert.py +4 -1
  428. transformers/models/idefics/configuration_idefics.py +5 -7
  429. transformers/models/idefics/modeling_idefics.py +3 -4
  430. transformers/models/idefics/vision.py +5 -4
  431. transformers/models/idefics2/configuration_idefics2.py +1 -2
  432. transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
  433. transformers/models/idefics2/modeling_idefics2.py +72 -50
  434. transformers/models/idefics3/configuration_idefics3.py +1 -3
  435. transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
  436. transformers/models/idefics3/modeling_idefics3.py +63 -40
  437. transformers/models/ijepa/modeling_ijepa.py +3 -3
  438. transformers/models/imagegpt/configuration_imagegpt.py +9 -1
  439. transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
  440. transformers/models/imagegpt/modeling_imagegpt.py +8 -4
  441. transformers/models/informer/modeling_informer.py +3 -3
  442. transformers/models/instructblip/configuration_instructblip.py +2 -1
  443. transformers/models/instructblip/modeling_instructblip.py +65 -39
  444. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
  445. transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
  446. transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
  447. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
  448. transformers/models/internvl/configuration_internvl.py +5 -0
  449. transformers/models/internvl/modeling_internvl.py +35 -55
  450. transformers/models/internvl/modular_internvl.py +26 -38
  451. transformers/models/internvl/video_processing_internvl.py +2 -2
  452. transformers/models/jais2/configuration_jais2.py +5 -7
  453. transformers/models/jais2/modeling_jais2.py +4 -4
  454. transformers/models/jamba/configuration_jamba.py +5 -7
  455. transformers/models/jamba/modeling_jamba.py +4 -4
  456. transformers/models/jamba/modular_jamba.py +3 -3
  457. transformers/models/janus/image_processing_janus.py +2 -2
  458. transformers/models/janus/image_processing_janus_fast.py +8 -8
  459. transformers/models/janus/modeling_janus.py +63 -146
  460. transformers/models/janus/modular_janus.py +62 -20
  461. transformers/models/jetmoe/configuration_jetmoe.py +6 -4
  462. transformers/models/jetmoe/modeling_jetmoe.py +3 -3
  463. transformers/models/jetmoe/modular_jetmoe.py +3 -3
  464. transformers/models/kosmos2/configuration_kosmos2.py +10 -8
  465. transformers/models/kosmos2/modeling_kosmos2.py +56 -34
  466. transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
  467. transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
  468. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
  469. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
  470. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
  471. transformers/models/lasr/configuration_lasr.py +2 -4
  472. transformers/models/lasr/modeling_lasr.py +3 -3
  473. transformers/models/lasr/modular_lasr.py +3 -3
  474. transformers/models/layoutlm/configuration_layoutlm.py +14 -1
  475. transformers/models/layoutlm/modeling_layoutlm.py +3 -3
  476. transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
  477. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
  478. transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
  479. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
  480. transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
  481. transformers/models/led/configuration_led.py +7 -8
  482. transformers/models/levit/image_processing_levit_fast.py +4 -4
  483. transformers/models/lfm2/configuration_lfm2.py +5 -7
  484. transformers/models/lfm2/modeling_lfm2.py +4 -4
  485. transformers/models/lfm2/modular_lfm2.py +3 -3
  486. transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
  487. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
  488. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  489. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
  490. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
  491. transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
  492. transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
  493. transformers/models/lightglue/modeling_lightglue.py +3 -3
  494. transformers/models/lightglue/modular_lightglue.py +3 -3
  495. transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
  496. transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
  497. transformers/models/lilt/configuration_lilt.py +6 -1
  498. transformers/models/llama/configuration_llama.py +5 -7
  499. transformers/models/llama/modeling_llama.py +4 -4
  500. transformers/models/llama4/configuration_llama4.py +67 -47
  501. transformers/models/llama4/image_processing_llama4_fast.py +3 -3
  502. transformers/models/llama4/modeling_llama4.py +46 -44
  503. transformers/models/llava/configuration_llava.py +10 -0
  504. transformers/models/llava/image_processing_llava_fast.py +3 -3
  505. transformers/models/llava/modeling_llava.py +38 -65
  506. transformers/models/llava_next/configuration_llava_next.py +2 -1
  507. transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
  508. transformers/models/llava_next/modeling_llava_next.py +61 -60
  509. transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
  510. transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
  511. transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
  512. transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
  513. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
  514. transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
  515. transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
  516. transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
  517. transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
  518. transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
  519. transformers/models/longformer/configuration_longformer.py +4 -1
  520. transformers/models/longt5/configuration_longt5.py +9 -6
  521. transformers/models/longt5/modeling_longt5.py +2 -1
  522. transformers/models/luke/configuration_luke.py +8 -1
  523. transformers/models/lw_detr/configuration_lw_detr.py +19 -31
  524. transformers/models/lw_detr/modeling_lw_detr.py +43 -44
  525. transformers/models/lw_detr/modular_lw_detr.py +36 -38
  526. transformers/models/lxmert/configuration_lxmert.py +16 -0
  527. transformers/models/m2m_100/configuration_m2m_100.py +7 -8
  528. transformers/models/m2m_100/modeling_m2m_100.py +3 -3
  529. transformers/models/mamba/configuration_mamba.py +5 -2
  530. transformers/models/mamba/modeling_mamba.py +18 -26
  531. transformers/models/mamba2/configuration_mamba2.py +5 -7
  532. transformers/models/mamba2/modeling_mamba2.py +22 -33
  533. transformers/models/marian/configuration_marian.py +10 -4
  534. transformers/models/marian/modeling_marian.py +4 -4
  535. transformers/models/markuplm/configuration_markuplm.py +4 -6
  536. transformers/models/markuplm/modeling_markuplm.py +3 -3
  537. transformers/models/mask2former/configuration_mask2former.py +12 -47
  538. transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
  539. transformers/models/mask2former/modeling_mask2former.py +18 -12
  540. transformers/models/maskformer/configuration_maskformer.py +14 -45
  541. transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
  542. transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
  543. transformers/models/maskformer/modeling_maskformer.py +15 -9
  544. transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
  545. transformers/models/mbart/configuration_mbart.py +9 -4
  546. transformers/models/mbart/modeling_mbart.py +9 -6
  547. transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
  548. transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
  549. transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
  550. transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
  551. transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
  552. transformers/models/mgp_str/modeling_mgp_str.py +4 -2
  553. transformers/models/mimi/configuration_mimi.py +4 -0
  554. transformers/models/mimi/modeling_mimi.py +40 -36
  555. transformers/models/minimax/configuration_minimax.py +8 -11
  556. transformers/models/minimax/modeling_minimax.py +5 -5
  557. transformers/models/minimax/modular_minimax.py +9 -12
  558. transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
  559. transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
  560. transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
  561. transformers/models/ministral/configuration_ministral.py +5 -7
  562. transformers/models/ministral/modeling_ministral.py +4 -4
  563. transformers/models/ministral/modular_ministral.py +5 -8
  564. transformers/models/ministral3/configuration_ministral3.py +4 -4
  565. transformers/models/ministral3/modeling_ministral3.py +4 -4
  566. transformers/models/ministral3/modular_ministral3.py +3 -3
  567. transformers/models/mistral/configuration_mistral.py +5 -7
  568. transformers/models/mistral/modeling_mistral.py +4 -4
  569. transformers/models/mistral/modular_mistral.py +3 -3
  570. transformers/models/mistral3/configuration_mistral3.py +4 -0
  571. transformers/models/mistral3/modeling_mistral3.py +36 -40
  572. transformers/models/mistral3/modular_mistral3.py +31 -32
  573. transformers/models/mixtral/configuration_mixtral.py +8 -11
  574. transformers/models/mixtral/modeling_mixtral.py +4 -4
  575. transformers/models/mlcd/modeling_mlcd.py +7 -5
  576. transformers/models/mlcd/modular_mlcd.py +7 -5
  577. transformers/models/mllama/configuration_mllama.py +5 -7
  578. transformers/models/mllama/image_processing_mllama_fast.py +6 -5
  579. transformers/models/mllama/modeling_mllama.py +19 -19
  580. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
  581. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
  582. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
  583. transformers/models/mobilebert/configuration_mobilebert.py +4 -1
  584. transformers/models/mobilebert/modeling_mobilebert.py +3 -3
  585. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
  586. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
  587. transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
  588. transformers/models/mobilevit/modeling_mobilevit.py +4 -2
  589. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
  590. transformers/models/modernbert/configuration_modernbert.py +46 -21
  591. transformers/models/modernbert/modeling_modernbert.py +146 -899
  592. transformers/models/modernbert/modular_modernbert.py +185 -908
  593. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
  594. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
  595. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
  596. transformers/models/moonshine/configuration_moonshine.py +12 -7
  597. transformers/models/moonshine/modeling_moonshine.py +7 -7
  598. transformers/models/moonshine/modular_moonshine.py +19 -13
  599. transformers/models/moshi/configuration_moshi.py +28 -2
  600. transformers/models/moshi/modeling_moshi.py +4 -9
  601. transformers/models/mpnet/configuration_mpnet.py +6 -1
  602. transformers/models/mpt/configuration_mpt.py +16 -0
  603. transformers/models/mra/configuration_mra.py +8 -1
  604. transformers/models/mt5/configuration_mt5.py +9 -5
  605. transformers/models/mt5/modeling_mt5.py +5 -8
  606. transformers/models/musicgen/configuration_musicgen.py +12 -7
  607. transformers/models/musicgen/modeling_musicgen.py +6 -5
  608. transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
  609. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
  610. transformers/models/mvp/configuration_mvp.py +8 -4
  611. transformers/models/mvp/modeling_mvp.py +6 -4
  612. transformers/models/nanochat/configuration_nanochat.py +5 -7
  613. transformers/models/nanochat/modeling_nanochat.py +4 -4
  614. transformers/models/nanochat/modular_nanochat.py +4 -4
  615. transformers/models/nemotron/configuration_nemotron.py +5 -7
  616. transformers/models/nemotron/modeling_nemotron.py +4 -14
  617. transformers/models/nllb/tokenization_nllb.py +7 -5
  618. transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
  619. transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
  620. transformers/models/nougat/image_processing_nougat_fast.py +8 -8
  621. transformers/models/nystromformer/configuration_nystromformer.py +8 -1
  622. transformers/models/olmo/configuration_olmo.py +5 -7
  623. transformers/models/olmo/modeling_olmo.py +4 -4
  624. transformers/models/olmo/modular_olmo.py +3 -3
  625. transformers/models/olmo2/configuration_olmo2.py +9 -11
  626. transformers/models/olmo2/modeling_olmo2.py +4 -4
  627. transformers/models/olmo2/modular_olmo2.py +7 -7
  628. transformers/models/olmo3/configuration_olmo3.py +10 -11
  629. transformers/models/olmo3/modeling_olmo3.py +4 -4
  630. transformers/models/olmo3/modular_olmo3.py +13 -14
  631. transformers/models/olmoe/configuration_olmoe.py +5 -7
  632. transformers/models/olmoe/modeling_olmoe.py +4 -4
  633. transformers/models/olmoe/modular_olmoe.py +3 -3
  634. transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
  635. transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
  636. transformers/models/oneformer/configuration_oneformer.py +9 -46
  637. transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
  638. transformers/models/oneformer/modeling_oneformer.py +14 -9
  639. transformers/models/openai/configuration_openai.py +16 -0
  640. transformers/models/opt/configuration_opt.py +6 -6
  641. transformers/models/opt/modeling_opt.py +5 -5
  642. transformers/models/ovis2/configuration_ovis2.py +4 -0
  643. transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
  644. transformers/models/ovis2/modeling_ovis2.py +58 -99
  645. transformers/models/ovis2/modular_ovis2.py +52 -13
  646. transformers/models/owlv2/configuration_owlv2.py +4 -1
  647. transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
  648. transformers/models/owlv2/modeling_owlv2.py +40 -27
  649. transformers/models/owlv2/modular_owlv2.py +5 -5
  650. transformers/models/owlvit/configuration_owlvit.py +4 -1
  651. transformers/models/owlvit/modeling_owlvit.py +40 -27
  652. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
  653. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
  654. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
  655. transformers/models/paligemma/configuration_paligemma.py +4 -0
  656. transformers/models/paligemma/modeling_paligemma.py +30 -26
  657. transformers/models/parakeet/configuration_parakeet.py +2 -4
  658. transformers/models/parakeet/modeling_parakeet.py +3 -3
  659. transformers/models/parakeet/modular_parakeet.py +3 -3
  660. transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
  661. transformers/models/patchtst/modeling_patchtst.py +3 -3
  662. transformers/models/pe_audio/modeling_pe_audio.py +4 -4
  663. transformers/models/pe_audio/modular_pe_audio.py +1 -1
  664. transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
  665. transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
  666. transformers/models/pe_video/modeling_pe_video.py +36 -24
  667. transformers/models/pe_video/modular_pe_video.py +36 -23
  668. transformers/models/pegasus/configuration_pegasus.py +8 -5
  669. transformers/models/pegasus/modeling_pegasus.py +4 -4
  670. transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
  671. transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
  672. transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
  673. transformers/models/perceiver/modeling_perceiver.py +17 -9
  674. transformers/models/perception_lm/modeling_perception_lm.py +26 -27
  675. transformers/models/perception_lm/modular_perception_lm.py +27 -25
  676. transformers/models/persimmon/configuration_persimmon.py +5 -7
  677. transformers/models/persimmon/modeling_persimmon.py +5 -5
  678. transformers/models/phi/configuration_phi.py +8 -6
  679. transformers/models/phi/modeling_phi.py +4 -4
  680. transformers/models/phi/modular_phi.py +3 -3
  681. transformers/models/phi3/configuration_phi3.py +9 -11
  682. transformers/models/phi3/modeling_phi3.py +4 -4
  683. transformers/models/phi3/modular_phi3.py +3 -3
  684. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
  685. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
  686. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
  687. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
  688. transformers/models/phimoe/configuration_phimoe.py +5 -7
  689. transformers/models/phimoe/modeling_phimoe.py +15 -39
  690. transformers/models/phimoe/modular_phimoe.py +12 -7
  691. transformers/models/pix2struct/configuration_pix2struct.py +12 -9
  692. transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
  693. transformers/models/pix2struct/modeling_pix2struct.py +14 -7
  694. transformers/models/pixio/configuration_pixio.py +2 -4
  695. transformers/models/pixio/modeling_pixio.py +9 -8
  696. transformers/models/pixio/modular_pixio.py +4 -2
  697. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
  698. transformers/models/pixtral/modeling_pixtral.py +9 -12
  699. transformers/models/plbart/configuration_plbart.py +8 -5
  700. transformers/models/plbart/modeling_plbart.py +9 -7
  701. transformers/models/plbart/modular_plbart.py +1 -1
  702. transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
  703. transformers/models/pop2piano/configuration_pop2piano.py +7 -6
  704. transformers/models/pop2piano/modeling_pop2piano.py +2 -1
  705. transformers/models/pp_doclayout_v3/__init__.py +30 -0
  706. transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
  707. transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
  708. transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
  709. transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
  710. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
  711. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
  712. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
  713. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
  714. transformers/models/prophetnet/configuration_prophetnet.py +11 -10
  715. transformers/models/prophetnet/modeling_prophetnet.py +12 -23
  716. transformers/models/pvt/image_processing_pvt.py +7 -7
  717. transformers/models/pvt/image_processing_pvt_fast.py +1 -1
  718. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  719. transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
  720. transformers/models/qwen2/configuration_qwen2.py +14 -4
  721. transformers/models/qwen2/modeling_qwen2.py +4 -4
  722. transformers/models/qwen2/modular_qwen2.py +3 -3
  723. transformers/models/qwen2/tokenization_qwen2.py +0 -4
  724. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
  725. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
  726. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
  727. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
  728. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
  729. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
  730. transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
  731. transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
  732. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  733. transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
  734. transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
  735. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
  736. transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
  737. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
  738. transformers/models/qwen3/configuration_qwen3.py +15 -5
  739. transformers/models/qwen3/modeling_qwen3.py +4 -4
  740. transformers/models/qwen3/modular_qwen3.py +3 -3
  741. transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
  742. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  743. transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
  744. transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
  745. transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
  746. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
  747. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
  748. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
  749. transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
  750. transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
  751. transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
  752. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
  753. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
  754. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
  755. transformers/models/rag/configuration_rag.py +6 -6
  756. transformers/models/rag/modeling_rag.py +3 -3
  757. transformers/models/rag/retrieval_rag.py +1 -1
  758. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
  759. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
  760. transformers/models/reformer/configuration_reformer.py +7 -7
  761. transformers/models/rembert/configuration_rembert.py +8 -1
  762. transformers/models/rembert/modeling_rembert.py +0 -22
  763. transformers/models/resnet/configuration_resnet.py +2 -4
  764. transformers/models/resnet/modeling_resnet.py +6 -5
  765. transformers/models/roberta/configuration_roberta.py +11 -2
  766. transformers/models/roberta/modeling_roberta.py +6 -6
  767. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
  768. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
  769. transformers/models/roc_bert/configuration_roc_bert.py +8 -1
  770. transformers/models/roc_bert/modeling_roc_bert.py +6 -41
  771. transformers/models/roformer/configuration_roformer.py +13 -2
  772. transformers/models/roformer/modeling_roformer.py +0 -14
  773. transformers/models/rt_detr/configuration_rt_detr.py +8 -49
  774. transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
  775. transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
  776. transformers/models/rt_detr/modeling_rt_detr.py +578 -737
  777. transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
  778. transformers/models/rt_detr/modular_rt_detr.py +1508 -6
  779. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
  780. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
  781. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
  782. transformers/models/rwkv/configuration_rwkv.py +2 -3
  783. transformers/models/rwkv/modeling_rwkv.py +0 -23
  784. transformers/models/sam/configuration_sam.py +2 -0
  785. transformers/models/sam/image_processing_sam_fast.py +4 -4
  786. transformers/models/sam/modeling_sam.py +13 -8
  787. transformers/models/sam/processing_sam.py +3 -3
  788. transformers/models/sam2/configuration_sam2.py +1 -1
  789. transformers/models/sam2/modeling_sam2.py +56 -52
  790. transformers/models/sam2/modular_sam2.py +47 -55
  791. transformers/models/sam2_video/modeling_sam2_video.py +50 -51
  792. transformers/models/sam2_video/modular_sam2_video.py +12 -10
  793. transformers/models/sam3/modeling_sam3.py +43 -47
  794. transformers/models/sam3/processing_sam3.py +8 -4
  795. transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
  796. transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
  797. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
  798. transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
  799. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
  800. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
  801. transformers/models/sam3_video/modeling_sam3_video.py +27 -14
  802. transformers/models/sam_hq/configuration_sam_hq.py +2 -0
  803. transformers/models/sam_hq/modeling_sam_hq.py +13 -9
  804. transformers/models/sam_hq/modular_sam_hq.py +6 -6
  805. transformers/models/sam_hq/processing_sam_hq.py +7 -6
  806. transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
  807. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
  808. transformers/models/seed_oss/configuration_seed_oss.py +7 -9
  809. transformers/models/seed_oss/modeling_seed_oss.py +4 -4
  810. transformers/models/seed_oss/modular_seed_oss.py +3 -3
  811. transformers/models/segformer/image_processing_segformer_fast.py +4 -4
  812. transformers/models/segformer/modeling_segformer.py +4 -2
  813. transformers/models/segformer/modular_segformer.py +3 -3
  814. transformers/models/seggpt/modeling_seggpt.py +20 -8
  815. transformers/models/sew/configuration_sew.py +4 -1
  816. transformers/models/sew/modeling_sew.py +9 -5
  817. transformers/models/sew/modular_sew.py +2 -1
  818. transformers/models/sew_d/configuration_sew_d.py +4 -1
  819. transformers/models/sew_d/modeling_sew_d.py +4 -1
  820. transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
  821. transformers/models/siglip/configuration_siglip.py +4 -1
  822. transformers/models/siglip/modeling_siglip.py +27 -71
  823. transformers/models/siglip2/__init__.py +1 -0
  824. transformers/models/siglip2/configuration_siglip2.py +4 -2
  825. transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
  826. transformers/models/siglip2/modeling_siglip2.py +37 -78
  827. transformers/models/siglip2/modular_siglip2.py +74 -25
  828. transformers/models/siglip2/tokenization_siglip2.py +95 -0
  829. transformers/models/smollm3/configuration_smollm3.py +6 -6
  830. transformers/models/smollm3/modeling_smollm3.py +4 -4
  831. transformers/models/smollm3/modular_smollm3.py +9 -9
  832. transformers/models/smolvlm/configuration_smolvlm.py +1 -3
  833. transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
  834. transformers/models/smolvlm/modeling_smolvlm.py +75 -46
  835. transformers/models/smolvlm/modular_smolvlm.py +36 -23
  836. transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
  837. transformers/models/solar_open/__init__.py +27 -0
  838. transformers/models/solar_open/configuration_solar_open.py +184 -0
  839. transformers/models/solar_open/modeling_solar_open.py +642 -0
  840. transformers/models/solar_open/modular_solar_open.py +224 -0
  841. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
  842. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
  843. transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
  844. transformers/models/speecht5/configuration_speecht5.py +7 -8
  845. transformers/models/splinter/configuration_splinter.py +6 -6
  846. transformers/models/splinter/modeling_splinter.py +8 -3
  847. transformers/models/squeezebert/configuration_squeezebert.py +14 -1
  848. transformers/models/stablelm/configuration_stablelm.py +8 -6
  849. transformers/models/stablelm/modeling_stablelm.py +5 -5
  850. transformers/models/starcoder2/configuration_starcoder2.py +11 -5
  851. transformers/models/starcoder2/modeling_starcoder2.py +5 -5
  852. transformers/models/starcoder2/modular_starcoder2.py +4 -4
  853. transformers/models/superglue/configuration_superglue.py +4 -0
  854. transformers/models/superglue/image_processing_superglue_fast.py +4 -3
  855. transformers/models/superglue/modeling_superglue.py +9 -4
  856. transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
  857. transformers/models/superpoint/modeling_superpoint.py +4 -2
  858. transformers/models/swin/configuration_swin.py +2 -4
  859. transformers/models/swin/modeling_swin.py +11 -8
  860. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
  861. transformers/models/swin2sr/modeling_swin2sr.py +4 -2
  862. transformers/models/swinv2/configuration_swinv2.py +2 -4
  863. transformers/models/swinv2/modeling_swinv2.py +10 -7
  864. transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
  865. transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
  866. transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
  867. transformers/models/t5/configuration_t5.py +9 -8
  868. transformers/models/t5/modeling_t5.py +5 -8
  869. transformers/models/t5gemma/configuration_t5gemma.py +10 -25
  870. transformers/models/t5gemma/modeling_t5gemma.py +9 -9
  871. transformers/models/t5gemma/modular_t5gemma.py +11 -24
  872. transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
  873. transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
  874. transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
  875. transformers/models/table_transformer/configuration_table_transformer.py +18 -49
  876. transformers/models/table_transformer/modeling_table_transformer.py +27 -53
  877. transformers/models/tapas/configuration_tapas.py +12 -1
  878. transformers/models/tapas/modeling_tapas.py +1 -1
  879. transformers/models/tapas/tokenization_tapas.py +1 -0
  880. transformers/models/textnet/configuration_textnet.py +4 -6
  881. transformers/models/textnet/image_processing_textnet_fast.py +3 -3
  882. transformers/models/textnet/modeling_textnet.py +15 -14
  883. transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
  884. transformers/models/timesfm/modeling_timesfm.py +5 -6
  885. transformers/models/timesfm/modular_timesfm.py +5 -6
  886. transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
  887. transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
  888. transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
  889. transformers/models/trocr/configuration_trocr.py +11 -7
  890. transformers/models/trocr/modeling_trocr.py +4 -2
  891. transformers/models/tvp/configuration_tvp.py +10 -35
  892. transformers/models/tvp/image_processing_tvp_fast.py +6 -5
  893. transformers/models/tvp/modeling_tvp.py +1 -1
  894. transformers/models/udop/configuration_udop.py +16 -7
  895. transformers/models/udop/modeling_udop.py +10 -6
  896. transformers/models/umt5/configuration_umt5.py +8 -6
  897. transformers/models/umt5/modeling_umt5.py +7 -3
  898. transformers/models/unispeech/configuration_unispeech.py +4 -1
  899. transformers/models/unispeech/modeling_unispeech.py +7 -4
  900. transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
  901. transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
  902. transformers/models/upernet/configuration_upernet.py +8 -35
  903. transformers/models/upernet/modeling_upernet.py +1 -1
  904. transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
  905. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  906. transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
  907. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
  908. transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
  909. transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
  910. transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
  911. transformers/models/video_llava/configuration_video_llava.py +4 -0
  912. transformers/models/video_llava/modeling_video_llava.py +87 -89
  913. transformers/models/videomae/modeling_videomae.py +4 -5
  914. transformers/models/vilt/configuration_vilt.py +4 -1
  915. transformers/models/vilt/image_processing_vilt_fast.py +6 -6
  916. transformers/models/vilt/modeling_vilt.py +27 -12
  917. transformers/models/vipllava/configuration_vipllava.py +4 -0
  918. transformers/models/vipllava/modeling_vipllava.py +57 -31
  919. transformers/models/vipllava/modular_vipllava.py +50 -24
  920. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
  921. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
  922. transformers/models/visual_bert/configuration_visual_bert.py +6 -1
  923. transformers/models/vit/configuration_vit.py +2 -2
  924. transformers/models/vit/modeling_vit.py +7 -5
  925. transformers/models/vit_mae/modeling_vit_mae.py +11 -7
  926. transformers/models/vit_msn/modeling_vit_msn.py +11 -7
  927. transformers/models/vitdet/configuration_vitdet.py +2 -4
  928. transformers/models/vitdet/modeling_vitdet.py +2 -3
  929. transformers/models/vitmatte/configuration_vitmatte.py +6 -35
  930. transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
  931. transformers/models/vitmatte/modeling_vitmatte.py +1 -1
  932. transformers/models/vitpose/configuration_vitpose.py +6 -43
  933. transformers/models/vitpose/modeling_vitpose.py +5 -3
  934. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
  935. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
  936. transformers/models/vits/configuration_vits.py +4 -0
  937. transformers/models/vits/modeling_vits.py +9 -7
  938. transformers/models/vivit/modeling_vivit.py +4 -4
  939. transformers/models/vjepa2/modeling_vjepa2.py +9 -9
  940. transformers/models/voxtral/configuration_voxtral.py +0 -1
  941. transformers/models/voxtral/modeling_voxtral.py +25 -24
  942. transformers/models/voxtral/modular_voxtral.py +26 -20
  943. transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
  944. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
  945. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
  946. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
  947. transformers/models/wavlm/configuration_wavlm.py +4 -1
  948. transformers/models/wavlm/modeling_wavlm.py +4 -1
  949. transformers/models/whisper/configuration_whisper.py +6 -4
  950. transformers/models/whisper/generation_whisper.py +0 -1
  951. transformers/models/whisper/modeling_whisper.py +3 -3
  952. transformers/models/x_clip/configuration_x_clip.py +4 -1
  953. transformers/models/x_clip/modeling_x_clip.py +26 -27
  954. transformers/models/xglm/configuration_xglm.py +9 -7
  955. transformers/models/xlm/configuration_xlm.py +10 -7
  956. transformers/models/xlm/modeling_xlm.py +1 -1
  957. transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
  958. transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
  959. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
  960. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
  961. transformers/models/xlnet/configuration_xlnet.py +3 -1
  962. transformers/models/xlstm/configuration_xlstm.py +5 -7
  963. transformers/models/xlstm/modeling_xlstm.py +0 -32
  964. transformers/models/xmod/configuration_xmod.py +11 -2
  965. transformers/models/xmod/modeling_xmod.py +13 -16
  966. transformers/models/yolos/image_processing_yolos_fast.py +25 -28
  967. transformers/models/yolos/modeling_yolos.py +7 -7
  968. transformers/models/yolos/modular_yolos.py +16 -16
  969. transformers/models/yoso/configuration_yoso.py +8 -1
  970. transformers/models/youtu/__init__.py +27 -0
  971. transformers/models/youtu/configuration_youtu.py +194 -0
  972. transformers/models/youtu/modeling_youtu.py +619 -0
  973. transformers/models/youtu/modular_youtu.py +254 -0
  974. transformers/models/zamba/configuration_zamba.py +5 -7
  975. transformers/models/zamba/modeling_zamba.py +25 -56
  976. transformers/models/zamba2/configuration_zamba2.py +8 -13
  977. transformers/models/zamba2/modeling_zamba2.py +53 -78
  978. transformers/models/zamba2/modular_zamba2.py +36 -29
  979. transformers/models/zoedepth/configuration_zoedepth.py +17 -40
  980. transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
  981. transformers/models/zoedepth/modeling_zoedepth.py +5 -3
  982. transformers/pipelines/__init__.py +1 -61
  983. transformers/pipelines/any_to_any.py +1 -1
  984. transformers/pipelines/automatic_speech_recognition.py +0 -2
  985. transformers/pipelines/base.py +1 -1
  986. transformers/pipelines/image_text_to_text.py +1 -1
  987. transformers/pipelines/text_to_audio.py +5 -1
  988. transformers/processing_utils.py +35 -44
  989. transformers/pytorch_utils.py +2 -26
  990. transformers/quantizers/quantizer_compressed_tensors.py +7 -5
  991. transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
  992. transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
  993. transformers/quantizers/quantizer_mxfp4.py +1 -1
  994. transformers/quantizers/quantizer_torchao.py +0 -16
  995. transformers/safetensors_conversion.py +11 -4
  996. transformers/testing_utils.py +3 -28
  997. transformers/tokenization_mistral_common.py +9 -0
  998. transformers/tokenization_python.py +6 -4
  999. transformers/tokenization_utils_base.py +119 -219
  1000. transformers/tokenization_utils_tokenizers.py +31 -2
  1001. transformers/trainer.py +25 -33
  1002. transformers/trainer_seq2seq.py +1 -1
  1003. transformers/training_args.py +411 -417
  1004. transformers/utils/__init__.py +1 -4
  1005. transformers/utils/auto_docstring.py +15 -18
  1006. transformers/utils/backbone_utils.py +13 -373
  1007. transformers/utils/doc.py +4 -36
  1008. transformers/utils/generic.py +69 -33
  1009. transformers/utils/import_utils.py +72 -75
  1010. transformers/utils/loading_report.py +133 -105
  1011. transformers/utils/quantization_config.py +0 -21
  1012. transformers/video_processing_utils.py +5 -5
  1013. transformers/video_utils.py +3 -1
  1014. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
  1015. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
  1016. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
  1017. transformers/pipelines/deprecated/text2text_generation.py +0 -408
  1018. transformers/pipelines/image_to_text.py +0 -189
  1019. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
  1020. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
  1021. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,58 @@
- import torch
+ # Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import math
+ import warnings
+ from dataclasses import dataclass

- from transformers.models.detr.image_processing_detr_fast import DetrImageProcessorFast
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch import Tensor

+ from ... import initialization as init
+ from ...backbone_utils import load_backbone
  from ...image_transforms import center_to_corners_format
+ from ...integrations import use_kernel_forward_from_hub
+ from ...modeling_outputs import BaseModelOutput
+ from ...modeling_utils import PreTrainedModel
+ from ...processing_utils import Unpack
+ from ...pytorch_utils import meshgrid
  from ...utils import (
+     ModelOutput,
      TensorType,
+     TransformersKwargs,
+     auto_docstring,
      logging,
+     torch_compilable_check,
+ )
+ from ...utils.generic import OutputRecorder, can_return_tuple, check_model_inputs
+ from ..detr.image_processing_detr_fast import DetrImageProcessorFast
+ from ..detr.modeling_detr import (
+     DetrConvEncoder,
+     DetrDecoderLayer,
+     DetrDecoderOutput,
+     DetrEncoder,
+     DetrEncoderLayer,
+     DetrLearnedPositionEmbedding,
+     DetrMLP,
+     DetrMLPPredictionHead,
+     DetrObjectDetectionOutput,
+     DetrSelfAttention,
+     DetrSinePositionEmbedding,
+     replace_batch_norm,
  )
+ from .configuration_deformable_detr import DeformableDetrConfig


  logger = logging.get_logger(__name__)
@@ -82,4 +128,1340 @@ class DeformableDetrImageProcessorFast(DetrImageProcessorFast):
          raise NotImplementedError("Panoptic segmentation post-processing is not implemented for Deformable DETR yet.")


- __all__ = ["DeformableDetrImageProcessorFast"]
+ class DeformableDetrDecoderOutput(DetrDecoderOutput):
+     r"""
+     cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+         Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+         sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
+         used to compute the weighted average in the cross-attention heads.
+     intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
+         Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
+         layernorm.
+     intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
+         Stacked intermediate reference points (reference points of each layer of the decoder).
+     """
+
+     intermediate_reference_points: torch.FloatTensor | None = None
+
+
+ @dataclass
+ @auto_docstring(
+     custom_intro="""
+     Base class for outputs of the Deformable DETR encoder-decoder model.
+     """
+ )
+ class DeformableDetrModelOutput(ModelOutput):
+     r"""
+     init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+         Initial reference points sent through the Transformer decoder.
+     last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+         Sequence of hidden-states at the output of the last layer of the decoder of the model.
+     intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+         Stacked intermediate hidden states (output of each layer of the decoder).
+     intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+         Stacked intermediate reference points (reference points of each layer of the decoder).
+     enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+         Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+         picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+         foreground and background).
+     enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+         Logits of predicted bounding boxes coordinates in the first stage.
+     """
+
+     init_reference_points: torch.FloatTensor | None = None
+     last_hidden_state: torch.FloatTensor | None = None
+     intermediate_hidden_states: torch.FloatTensor | None = None
+     intermediate_reference_points: torch.FloatTensor | None = None
+     decoder_hidden_states: tuple[torch.FloatTensor] | None = None
+     decoder_attentions: tuple[torch.FloatTensor] | None = None
+     cross_attentions: tuple[torch.FloatTensor] | None = None
+     encoder_last_hidden_state: torch.FloatTensor | None = None
+     encoder_hidden_states: tuple[torch.FloatTensor] | None = None
+     encoder_attentions: tuple[torch.FloatTensor] | None = None
+     enc_outputs_class: torch.FloatTensor | None = None
+     enc_outputs_coord_logits: torch.FloatTensor | None = None
+
+
+ class DeformableDetrObjectDetectionOutput(DetrObjectDetectionOutput):
+     r"""
+     loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
+         Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
+         bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+         scale-invariant IoU loss.
+     loss_dict (`Dict`, *optional*):
+         A dictionary containing the individual losses. Useful for logging.
+     logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+         Classification logits (including no-object) for all queries.
+     pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+         Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+         values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+         possible padding). You can use [`~DeformableDetrProcessor.post_process_object_detection`] to retrieve the
+         unnormalized bounding boxes.
+     auxiliary_outputs (`list[Dict]`, *optional*):
+         Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+         and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+         `pred_boxes`) for each decoder layer.
+     last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+         Sequence of hidden-states at the output of the last layer of the decoder of the model.
+     init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+         Initial reference points sent through the Transformer decoder.
+     intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+         Stacked intermediate hidden states (output of each layer of the decoder).
+     intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+         Stacked intermediate reference points (reference points of each layer of the decoder).
+     enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+         Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+         picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+         foreground and background).
+     enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+         Logits of predicted bounding boxes coordinates in the first stage.
+     """
+
+     init_reference_points: torch.FloatTensor | None = None
+     intermediate_hidden_states: torch.FloatTensor | None = None
+     intermediate_reference_points: torch.FloatTensor | None = None
+     enc_outputs_class: torch.FloatTensor | None = None
+     enc_outputs_coord_logits: torch.FloatTensor | None = None
+
+
+ def inverse_sigmoid(x, eps=1e-5):
+     x = x.clamp(min=0, max=1)
+     x1 = x.clamp(min=eps)
+     x2 = (1 - x).clamp(min=eps)
+     return torch.log(x1 / x2)
+
+
+ @use_kernel_forward_from_hub("MultiScaleDeformableAttention")
+ class MultiScaleDeformableAttention(nn.Module):
+     def forward(
+         self,
+         value: Tensor,
+         value_spatial_shapes: Tensor,
+         value_spatial_shapes_list: list[tuple],
+         level_start_index: Tensor,
+         sampling_locations: Tensor,
+         attention_weights: Tensor,
+         im2col_step: int,
+     ):
+         batch_size, _, num_heads, hidden_dim = value.shape
+         _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
+         value_list = value.split([height * width for height, width in value_spatial_shapes_list], dim=1)
+         sampling_grids = 2 * sampling_locations - 1
+         sampling_value_list = []
+         for level_id, (height, width) in enumerate(value_spatial_shapes_list):
+             # batch_size, height*width, num_heads, hidden_dim
+             # -> batch_size, height*width, num_heads*hidden_dim
+             # -> batch_size, num_heads*hidden_dim, height*width
+             # -> batch_size*num_heads, hidden_dim, height, width
+             value_l_ = (
+                 value_list[level_id]
+                 .flatten(2)
+                 .transpose(1, 2)
+                 .reshape(batch_size * num_heads, hidden_dim, height, width)
+             )
+             # batch_size, num_queries, num_heads, num_points, 2
+             # -> batch_size, num_heads, num_queries, num_points, 2
+             # -> batch_size*num_heads, num_queries, num_points, 2
+             sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)
+             # batch_size*num_heads, hidden_dim, num_queries, num_points
+             sampling_value_l_ = nn.functional.grid_sample(
+                 value_l_,
+                 sampling_grid_l_,
+                 mode="bilinear",
+                 padding_mode="zeros",
+                 align_corners=False,
+             )
+             sampling_value_list.append(sampling_value_l_)
+         # (batch_size, num_queries, num_heads, num_levels, num_points)
+         # -> (batch_size, num_heads, num_queries, num_levels, num_points)
+         # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points)
+         attention_weights = attention_weights.transpose(1, 2).reshape(
+             batch_size * num_heads, 1, num_queries, num_levels * num_points
+         )
+         output = (
+             (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
+             .sum(-1)
+             .view(batch_size, num_heads * hidden_dim, num_queries)
+         )
+         return output.transpose(1, 2).contiguous()
+
+
+ class DeformableDetrConvEncoder(DetrConvEncoder):
+     def __init__(self, config):
+         nn.Module.__init__()
+
+         self.config = config
+
+         backbone = load_backbone(config)
+         self.intermediate_channel_sizes = backbone.channels
+
+         # replace batch norm by frozen batch norm
+         with torch.no_grad():
+             replace_batch_norm(backbone)
+
+         # We used to load with timm library directly instead of the AutoBackbone API
+         # so we need to unwrap the `backbone._backbone` module to load weights without mismatch
+         is_timm_model = False
+         if hasattr(backbone, "_backbone"):
+             backbone = backbone._backbone
+             is_timm_model = True
+         self.model = backbone
+
+         backbone_model_type = config.backbone_config.model_type
+         if "resnet" in backbone_model_type:
+             for name, parameter in self.model.named_parameters():
+                 if is_timm_model:
+                     if "layer2" not in name and "layer3" not in name and "layer4" not in name:
+                         parameter.requires_grad_(False)
+                 else:
+                     if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
+                         parameter.requires_grad_(False)
+
+
+ class DeformableDetrSinePositionEmbedding(DetrSinePositionEmbedding):
+     def forward(
+         self,
+         shape: torch.Size,
+         device: torch.device | str,
+         dtype: torch.dtype,
+         mask: torch.Tensor | None = None,
+     ) -> torch.Tensor:
+         if mask is None:
+             mask = torch.zeros((shape[0], shape[2], shape[3]), device=device, dtype=torch.bool)
+         y_embed = mask.cumsum(1, dtype=dtype)
+         x_embed = mask.cumsum(2, dtype=dtype)
+         if self.normalize:
+             eps = 1e-6
+             y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale
+             x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale
+
+         dim_t = torch.arange(self.num_position_features, dtype=torch.int64, device=device).to(dtype)
+         dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.num_position_features)
+
+         pos_x = x_embed[:, :, :, None] / dim_t
+         pos_y = y_embed[:, :, :, None] / dim_t
+         pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+         pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+         pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+         # Flatten spatial dimensions and permute to (batch_size, sequence_length, hidden_size) format
+         # expected by the encoder
+         pos = pos.flatten(2).permute(0, 2, 1)
+         return pos
+
+
+ class DeformableDetrLearnedPositionEmbedding(DetrLearnedPositionEmbedding):
+     pass
+
+
+ class DeformableDetrSelfAttention(DetrSelfAttention):
+     pass
+
+
360
+ class DeformableDetrMultiscaleDeformableAttention(nn.Module):
361
+ """
362
+ Multiscale deformable attention as proposed in Deformable DETR.
363
+ """
364
+
365
+ def __init__(self, config: DeformableDetrConfig, num_heads: int, n_points: int):
366
+ super().__init__()
367
+
368
+ self.attn = MultiScaleDeformableAttention()
369
+
370
+ if config.d_model % num_heads != 0:
371
+ raise ValueError(
372
+ f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}"
373
+ )
374
+ dim_per_head = config.d_model // num_heads
375
+ # check if dim_per_head is power of 2
376
+ if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0):
377
+ warnings.warn(
378
+ "You'd better set embed_dim (d_model) in DeformableDetrMultiscaleDeformableAttention to make the"
379
+ " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA"
380
+ " implementation."
381
+ )
382
+
383
+ self.im2col_step = 64
384
+
385
+ self.d_model = config.d_model
386
+ self.n_levels = config.num_feature_levels
387
+ self.n_heads = num_heads
388
+ self.n_points = n_points
389
+
390
+ self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2)
391
+ self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points)
392
+ self.value_proj = nn.Linear(config.d_model, config.d_model)
393
+ self.output_proj = nn.Linear(config.d_model, config.d_model)
394
+
395
+ self.disable_custom_kernels = config.disable_custom_kernels
396
+
397
+ def forward(
398
+ self,
399
+ hidden_states: torch.Tensor,
400
+ attention_mask: torch.Tensor | None = None,
401
+ encoder_hidden_states=None,
402
+ encoder_attention_mask=None,
403
+ position_embeddings: torch.Tensor | None = None,
404
+ reference_points=None,
405
+ spatial_shapes=None,
406
+ spatial_shapes_list=None,
407
+ level_start_index=None,
408
+ **kwargs: Unpack[TransformersKwargs],
409
+ ) -> tuple[torch.Tensor, torch.Tensor]:
410
+ # add position embeddings to the hidden states before projecting to queries and keys
411
+ if position_embeddings is not None:
412
+ hidden_states = hidden_states + position_embeddings
413
+
414
+ batch_size, num_queries, _ = hidden_states.shape
415
+ batch_size, sequence_length, _ = encoder_hidden_states.shape
416
+ total_elements = sum(height * width for height, width in spatial_shapes_list)
417
+ torch_compilable_check(
418
+ total_elements == sequence_length,
419
+ "Make sure to align the spatial shapes with the sequence length of the encoder hidden states",
420
+ )
421
+
422
+ value = self.value_proj(encoder_hidden_states)
423
+ if attention_mask is not None:
424
+ # we invert the attention_mask
425
+ value = value.masked_fill(~attention_mask[..., None], float(0))
426
+ value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads)
427
+ sampling_offsets = self.sampling_offsets(hidden_states).view(
428
+ batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2
429
+ )
430
+ attention_weights = self.attention_weights(hidden_states).view(
431
+ batch_size, num_queries, self.n_heads, self.n_levels * self.n_points
432
+ )
433
+ attention_weights = F.softmax(attention_weights, -1).view(
434
+ batch_size, num_queries, self.n_heads, self.n_levels, self.n_points
435
+ )
436
+ # batch_size, num_queries, n_heads, n_levels, n_points, 2
437
+ num_coordinates = reference_points.shape[-1]
438
+ if num_coordinates == 2:
439
+ offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
440
+ sampling_locations = (
441
+ reference_points[:, :, None, :, None, :]
442
+ + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
443
+ )
444
+ elif num_coordinates == 4:
445
+ sampling_locations = (
446
+ reference_points[:, :, None, :, None, :2]
447
+ + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
448
+ )
449
+ else:
450
+ raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
451
+
452
+ output = self.attn(
453
+ value,
454
+ spatial_shapes,
455
+ spatial_shapes_list,
456
+ level_start_index,
457
+ sampling_locations,
458
+ attention_weights,
459
+ self.im2col_step,
460
+ )
461
+
462
+ output = self.output_proj(output)
463
+
464
+ return output, attention_weights
465
+
466
+
467
+ class DeformableDetrMLP(DetrMLP):
468
+ pass
469
+
470
+
471
+ class DeformableDetrEncoderLayer(DetrEncoderLayer):
472
+ def __init__(self, config: DeformableDetrConfig):
473
+ super().__init__()
474
+ self.self_attn = DeformableDetrMultiscaleDeformableAttention(
475
+ config,
476
+ num_heads=config.encoder_attention_heads,
477
+ n_points=config.encoder_n_points,
478
+ )
479
+
480
+ def forward(
481
+ self,
482
+ hidden_states: torch.Tensor,
483
+ attention_mask: torch.Tensor,
484
+ spatial_position_embeddings: torch.Tensor | None = None,
485
+ reference_points=None,
486
+ spatial_shapes=None,
487
+ spatial_shapes_list=None,
488
+ level_start_index=None,
489
+ **kwargs: Unpack[TransformersKwargs],
490
+ ) -> torch.Tensor:
491
+ """
492
+ Args:
493
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
494
+ Input to the layer.
495
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
496
+ Attention mask.
497
+ position_embeddings (`torch.FloatTensor`, *optional*):
498
+ Position embeddings, to be added to `hidden_states`.
499
+ reference_points (`torch.FloatTensor`, *optional*):
500
+ Reference points.
501
+ spatial_shapes (`torch.LongTensor`, *optional*):
502
+ Spatial shapes of the backbone feature maps.
503
+ level_start_index (`torch.LongTensor`, *optional*):
504
+ Level start index.
505
+ """
506
+ super().forward(
507
+ hidden_states=hidden_states,
508
+ attention_mask=attention_mask,
509
+ spatial_position_embeddings=spatial_position_embeddings,
510
+ **kwargs,
511
+ )
512
+ hidden_states, _ = self.self_attn(
513
+ hidden_states=hidden_states,
514
+ attention_mask=attention_mask,
515
+ encoder_hidden_states=hidden_states,
516
+ encoder_attention_mask=attention_mask,
517
+ position_embeddings=spatial_position_embeddings,
518
+ reference_points=reference_points,
519
+ spatial_shapes=spatial_shapes,
520
+ spatial_shapes_list=spatial_shapes_list,
521
+ level_start_index=level_start_index,
522
+ )
523
+
524
+
525
+ class DeformableDetrDecoderLayer(DetrDecoderLayer):
526
+ def __init__(self, config: DeformableDetrConfig):
527
+ super().__init__()
528
+ self.encoder_attn = DeformableDetrMultiscaleDeformableAttention(
529
+ config,
530
+ num_heads=config.decoder_attention_heads,
531
+ n_points=config.decoder_n_points,
532
+ )
533
+
534
+ def forward(
535
+ self,
536
+ hidden_states: torch.Tensor,
537
+ object_queries_position_embeddings: torch.Tensor | None = None,
538
+ reference_points=None,
539
+ spatial_shapes=None,
540
+ spatial_shapes_list=None,
541
+ level_start_index=None,
542
+ encoder_hidden_states: torch.Tensor | None = None,
543
+ encoder_attention_mask: torch.Tensor | None = None,
544
+ **kwargs: Unpack[TransformersKwargs],
545
+ ) -> torch.Tensor:
546
+ """
547
+ Args:
548
+ hidden_states (`torch.FloatTensor`):
549
+ Input to the layer of shape `(seq_len, batch, embed_dim)`.
550
+ position_embeddings (`torch.FloatTensor`, *optional*):
551
+ Position embeddings that are added to the queries and keys in the self-attention layer.
552
+ reference_points (`torch.FloatTensor`, *optional*):
553
+ Reference points.
554
+ spatial_shapes (`torch.LongTensor`, *optional*):
555
+ Spatial shapes.
556
+ level_start_index (`torch.LongTensor`, *optional*):
557
+ Level start index.
558
+ encoder_hidden_states (`torch.FloatTensor`):
559
+ cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
560
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
561
+ `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
562
+ values.
563
+ """
564
+ residual = hidden_states
565
+
566
+ # Self Attention
567
+ hidden_states, _ = self.self_attn(
568
+ hidden_states=hidden_states,
569
+ position_embeddings=object_queries_position_embeddings,
570
+ **kwargs,
571
+ )
572
+
573
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
574
+ hidden_states = residual + hidden_states
575
+ hidden_states = self.self_attn_layer_norm(hidden_states)
576
+
577
+ residual = hidden_states
578
+
579
+ # Cross-Attention
580
+ hidden_states, _ = self.encoder_attn(
581
+ hidden_states=hidden_states,
582
+ attention_mask=encoder_attention_mask,
583
+ encoder_hidden_states=encoder_hidden_states,
584
+ encoder_attention_mask=encoder_attention_mask,
585
+ position_embeddings=object_queries_position_embeddings,
586
+ reference_points=reference_points,
587
+ spatial_shapes=spatial_shapes,
588
+ spatial_shapes_list=spatial_shapes_list,
589
+ level_start_index=level_start_index,
590
+ )
591
+
592
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
593
+ hidden_states = residual + hidden_states
594
+
595
+ hidden_states = self.encoder_attn_layer_norm(hidden_states)
596
+
597
+ # Fully Connected
598
+ residual = hidden_states
599
+ hidden_states = self.mlp(hidden_states)
600
+ hidden_states = residual + hidden_states
601
+ hidden_states = self.final_layer_norm(hidden_states)
602
+
603
+ return hidden_states
604
+
605
+
606
+ @auto_docstring
607
+ class DeformableDetrPreTrainedModel(PreTrainedModel):
608
+ config: DeformableDetrConfig
609
+ base_model_prefix = "model"
610
+ main_input_name = "pixel_values"
611
+ input_modalities = ("image",)
612
+ supports_gradient_checkpointing = True
613
+ _no_split_modules = [
614
+ r"DeformableDetrConvEncoder",
615
+ r"DeformableDetrEncoderLayer",
616
+ r"DeformableDetrDecoderLayer",
617
+ ]
618
+ _supports_sdpa = True
619
+ _supports_flash_attn = True
620
+ _supports_attention_backend = True
621
+ _supports_flex_attn = True
622
+ _keys_to_ignore_on_load_unexpected = [
623
+ r"detr\.model\.backbone\.model\.layer\d+\.0\.downsample\.1\.num_batches_tracked"
624
+ ]
625
+
626
+ @torch.no_grad()
627
+ def _init_weights(self, module):
628
+ std = self.config.init_std
629
+
630
+ if isinstance(module, DeformableDetrLearnedPositionEmbedding):
631
+ init.uniform_(module.row_embeddings.weight)
632
+ init.uniform_(module.column_embeddings.weight)
633
+ elif isinstance(module, DeformableDetrMultiscaleDeformableAttention):
634
+ init.constant_(module.sampling_offsets.weight, 0.0)
635
+ default_dtype = torch.get_default_dtype()
636
+ thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * (
637
+ 2.0 * math.pi / module.n_heads
638
+ )
639
+ grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
640
+ grid_init = (
641
+ (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
642
+ .view(module.n_heads, 1, 1, 2)
643
+ .repeat(1, module.n_levels, module.n_points, 1)
644
+ )
645
+ for i in range(module.n_points):
646
+ grid_init[:, :, i, :] *= i + 1
647
+
648
+ init.copy_(module.sampling_offsets.bias, grid_init.view(-1))
649
+ init.constant_(module.attention_weights.weight, 0.0)
650
+ init.constant_(module.attention_weights.bias, 0.0)
651
+ init.xavier_uniform_(module.value_proj.weight)
652
+ init.constant_(module.value_proj.bias, 0.0)
653
+ init.xavier_uniform_(module.output_proj.weight)
654
+ init.constant_(module.output_proj.bias, 0.0)
655
+ elif isinstance(module, (nn.Linear, nn.Conv2d)):
656
+ init.normal_(module.weight, mean=0.0, std=std)
657
+ if module.bias is not None:
658
+ init.zeros_(module.bias)
659
+ elif isinstance(module, nn.Embedding):
660
+ init.normal_(module.weight, mean=0.0, std=std)
661
+ # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
662
+ if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
663
+ init.zeros_(module.weight[module.padding_idx])
664
+ if hasattr(module, "reference_points") and not self.config.two_stage:
665
+ init.xavier_uniform_(module.reference_points.weight, gain=1.0)
666
+ init.constant_(module.reference_points.bias, 0.0)
667
+ if hasattr(module, "level_embed"):
668
+ init.normal_(module.level_embed)
669
+
670
+
671
+ class DeformableDetrEncoder(DetrEncoder):
672
+ """
673
+ Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a
674
+ [`DeformableDetrEncoderLayer`].
675
+
676
+ The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers.
677
+
678
+ Args:
679
+ config: DeformableDetrConfig
680
+ """
681
+
682
+ _can_record_outputs = {
683
+ "hidden_states": DeformableDetrEncoderLayer,
684
+ "attentions": OutputRecorder(DeformableDetrMultiscaleDeformableAttention, layer_name="self_attn", index=1),
685
+ }
686
+
687
+ @check_model_inputs()
688
+ def forward(
689
+ self,
690
+ inputs_embeds=None,
691
+ attention_mask=None,
692
+ spatial_position_embeddings=None,
693
+ spatial_shapes=None,
694
+ spatial_shapes_list=None,
695
+ level_start_index=None,
696
+ valid_ratios=None,
697
+ **kwargs: Unpack[TransformersKwargs],
698
+ ):
699
+ r"""
700
+ Args:
701
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
702
+ Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
703
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
704
+ Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
705
+ - 1 for pixel features that are real (i.e. **not masked**),
706
+ - 0 for pixel features that are padding (i.e. **masked**).
707
+ [What are attention masks?](../glossary#attention-mask)
708
+ spatial_position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
709
+ Spatial position embeddings (2D positional encodings) that are added to the queries and keys in each self-attention layer.
710
+ spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
711
+ Spatial shapes of each feature map.
712
+ level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`):
713
+ Starting index of each feature map.
714
+ valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
715
+ Ratio of valid area in each feature level.
716
+ """
717
+ hidden_states = inputs_embeds
718
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
719
+
720
+ spatial_shapes_tuple = tuple(spatial_shapes_list)
721
+ reference_points = self.get_reference_points(spatial_shapes_tuple, valid_ratios, device=inputs_embeds.device)
722
+
723
+ for encoder_layer in self.layers:
724
+ hidden_states = encoder_layer(
725
+ hidden_states,
726
+ attention_mask,
727
+ spatial_position_embeddings=spatial_position_embeddings,
728
+ reference_points=reference_points,
729
+ spatial_shapes=spatial_shapes,
730
+ spatial_shapes_list=spatial_shapes_list,
731
+ level_start_index=level_start_index,
732
+ **kwargs,
733
+ )
734
+
735
+ return BaseModelOutput(last_hidden_state=hidden_states)
736
+
737
+ @staticmethod
738
+ def get_reference_points(spatial_shapes_list, valid_ratios, device):
739
+ """
740
+ Get reference points for each feature map. Used in decoder.
741
+
742
+ Args:
743
+ spatial_shapes_list (`list[tuple[int, int]]`):
744
+ Spatial shapes of each feature map.
745
+ valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
746
+ Valid ratios of each feature map.
747
+ device (`torch.device`):
748
+ Device on which to create the tensors.
749
+ Returns:
750
+ `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
751
+ """
752
+ reference_points_list = []
753
+ for level, (height, width) in enumerate(spatial_shapes_list):
754
+ ref_y, ref_x = meshgrid(
755
+ torch.linspace(0.5, height - 0.5, height, dtype=valid_ratios.dtype, device=device),
756
+ torch.linspace(0.5, width - 0.5, width, dtype=valid_ratios.dtype, device=device),
757
+ indexing="ij",
758
+ )
759
+ # TODO: valid_ratios could be useless here. check https://github.com/fundamentalvision/Deformable-DETR/issues/36
760
+ ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height)
761
+ ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width)
762
+ ref = torch.stack((ref_x, ref_y), -1)
763
+ reference_points_list.append(ref)
764
+ reference_points = torch.cat(reference_points_list, 1)
765
+ reference_points = reference_points[:, :, None] * valid_ratios[:, None]
766
+ return reference_points
767
+
768
+
769
+ class DeformableDetrDecoder(DeformableDetrPreTrainedModel):
770
+ """
771
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DeformableDetrDecoderLayer`].
772
+
773
+ The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
774
+
775
+ Some tweaks for Deformable DETR:
776
+
777
+ - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass.
778
+ - it also returns a stack of intermediate outputs and reference points from all decoding layers.
779
+
780
+ Args:
781
+ config: DeformableDetrConfig
782
+ """
783
+
784
+ _can_record_outputs = {
785
+ "hidden_states": DeformableDetrDecoderLayer,
786
+ "attentions": OutputRecorder(DeformableDetrSelfAttention, layer_name="self_attn", index=1),
787
+ "cross_attentions": OutputRecorder(
788
+ DeformableDetrMultiscaleDeformableAttention, layer_name="encoder_attn", index=1
789
+ ),
790
+ }
791
+
792
+ def __init__(self, config: DeformableDetrConfig):
793
+ super().__init__(config)
794
+
795
+ self.dropout = config.dropout
796
+ self.layers = nn.ModuleList([DeformableDetrDecoderLayer(config) for _ in range(config.decoder_layers)])
797
+
798
+ # hack implementation for iterative bounding box refinement and two-stage Deformable DETR
799
+ self.bbox_embed = None
800
+ self.class_embed = None
801
+
802
+ # Initialize weights and apply final processing
803
+ self.post_init()
804
+
805
+ @check_model_inputs()
806
+ def forward(
807
+ self,
808
+ inputs_embeds=None,
809
+ encoder_hidden_states=None,
810
+ encoder_attention_mask=None,
811
+ object_queries_position_embeddings=None,
812
+ reference_points=None,
813
+ spatial_shapes=None,
814
+ spatial_shapes_list=None,
815
+ level_start_index=None,
816
+ valid_ratios=None,
817
+ **kwargs: Unpack[TransformersKwargs],
818
+ ):
819
+ r"""
820
+ Args:
821
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
822
+ The query embeddings that are passed into the decoder.
823
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
824
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
825
+ of the decoder.
826
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
827
+ Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
828
+ in `[0, 1]`:
829
+ - 1 for pixels that are real (i.e. **not masked**),
830
+ - 0 for pixels that are padding (i.e. **masked**).
831
+ object_queries_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
832
+ Position embeddings for the object query slots that are added to the queries and keys in each self-attention layer.
833
+ reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*):
834
+ Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area.
835
+ spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
836
+ Spatial shapes of the feature maps.
837
+ level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
838
+ Indexes for the start of each feature level. In range `[0, sequence_length]`.
839
+ valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
840
+ Ratio of valid area in each feature level.
841
+
842
+ """
843
+ if inputs_embeds is not None:
844
+ hidden_states = inputs_embeds
845
+
846
+ # decoder layers
847
+ intermediate = ()
848
+ intermediate_reference_points = ()
849
+
850
+ for idx, decoder_layer in enumerate(self.layers):
851
+ num_coordinates = reference_points.shape[-1]
852
+ if num_coordinates == 4:
853
+ reference_points_input = (
854
+ reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
855
+ )
856
+ elif reference_points.shape[-1] == 2:
857
+ reference_points_input = reference_points[:, :, None] * valid_ratios[:, None]
858
+ else:
859
+ raise ValueError("Reference points' last dimension must be of size 2")
860
+
861
+ hidden_states = decoder_layer(
862
+ hidden_states,
863
+ object_queries_position_embeddings,
864
+ reference_points_input,
865
+ spatial_shapes,
866
+ spatial_shapes_list,
867
+ level_start_index,
868
+ encoder_hidden_states, # as a positional argument for gradient checkpointing
869
+ encoder_attention_mask,
870
+ **kwargs,
871
+ )
872
+
873
+ # hack implementation for iterative bounding box refinement
874
+ if self.bbox_embed is not None:
875
+ tmp = self.bbox_embed[idx](hidden_states)
876
+ num_coordinates = reference_points.shape[-1]
877
+ if num_coordinates == 4:
878
+ new_reference_points = tmp + inverse_sigmoid(reference_points)
879
+ new_reference_points = new_reference_points.sigmoid()
880
+ elif num_coordinates == 2:
881
+ new_reference_points = tmp
882
+ new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points)
883
+ new_reference_points = new_reference_points.sigmoid()
884
+ else:
885
+ raise ValueError(
886
+ f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}"
887
+ )
888
+ reference_points = new_reference_points.detach()
889
+
890
+ intermediate += (hidden_states,)
891
+ intermediate_reference_points += (reference_points,)
892
+
893
+ # Keep batch_size as first dimension
894
+ intermediate = torch.stack(intermediate, dim=1)
895
+ intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1)
896
+
897
+ return DeformableDetrDecoderOutput(
898
+ last_hidden_state=hidden_states,
899
+ intermediate_hidden_states=intermediate,
900
+ intermediate_reference_points=intermediate_reference_points,
901
+ )
902
+
903
+
904
+ @auto_docstring(
905
+ custom_intro="""
906
+ The bare Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw
907
+ hidden-states without any specific head on top.
908
+ """
909
+ )
910
+ class DeformableDetrModel(DeformableDetrPreTrainedModel):
911
+ def __init__(self, config: DeformableDetrConfig):
912
+ super().__init__(config)
913
+
914
+ # Create backbone
915
+ self.backbone = DeformableDetrConvEncoder(config)
916
+
917
+ # Create positional encoding
918
+ if config.position_embedding_type == "sine":
919
+ self.position_embedding = DeformableDetrSinePositionEmbedding(config.d_model // 2, normalize=True)
920
+ elif config.position_embedding_type == "learned":
921
+ self.position_embedding = DeformableDetrLearnedPositionEmbedding(config.d_model // 2)
922
+ else:
923
+ raise ValueError(f"Not supported {config.position_embedding_type}")
924
+
925
+ # Create input projection layers
926
+ if config.num_feature_levels > 1:
927
+ num_backbone_outs = len(self.backbone.intermediate_channel_sizes)
928
+ input_proj_list = []
929
+ for _ in range(num_backbone_outs):
930
+ in_channels = self.backbone.intermediate_channel_sizes[_]
931
+ input_proj_list.append(
932
+ nn.Sequential(
933
+ nn.Conv2d(in_channels, config.d_model, kernel_size=1),
934
+ nn.GroupNorm(32, config.d_model),
935
+ )
936
+ )
937
+ for _ in range(config.num_feature_levels - num_backbone_outs):
938
+ input_proj_list.append(
939
+ nn.Sequential(
940
+ nn.Conv2d(
941
+ in_channels,
942
+ config.d_model,
943
+ kernel_size=3,
944
+ stride=2,
945
+ padding=1,
946
+ ),
947
+ nn.GroupNorm(32, config.d_model),
948
+ )
949
+ )
950
+ in_channels = config.d_model
951
+ self.input_proj = nn.ModuleList(input_proj_list)
952
+ else:
953
+ self.input_proj = nn.ModuleList(
954
+ [
955
+ nn.Sequential(
956
+ nn.Conv2d(
957
+ self.backbone.intermediate_channel_sizes[-1],
958
+ config.d_model,
959
+ kernel_size=1,
960
+ ),
961
+ nn.GroupNorm(32, config.d_model),
962
+ )
963
+ ]
964
+ )
965
+
966
+ if not config.two_stage:
967
+ self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2)
968
+
969
+ self.encoder = DeformableDetrEncoder(config)
970
+ self.decoder = DeformableDetrDecoder(config)
971
+
972
+ self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model))
973
+
974
+ if config.two_stage:
975
+ self.enc_output = nn.Linear(config.d_model, config.d_model)
976
+ self.enc_output_norm = nn.LayerNorm(config.d_model)
977
+ self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2)
978
+ self.pos_trans_norm = nn.LayerNorm(config.d_model * 2)
979
+ else:
980
+ self.reference_points = nn.Linear(config.d_model, 2)
981
+
982
+ self.post_init()
983
+
984
+ def freeze_backbone(self):
985
+ for name, param in self.backbone.model.named_parameters():
986
+ param.requires_grad_(False)
987
+
988
+ def unfreeze_backbone(self):
989
+ for name, param in self.backbone.model.named_parameters():
990
+ param.requires_grad_(True)
991
+
992
+ def get_valid_ratio(self, mask, dtype=torch.float32):
993
+ """Get the valid ratio of all feature maps."""
994
+
995
+ _, height, width = mask.shape
996
+ valid_height = torch.sum(mask[:, :, 0], 1)
997
+ valid_width = torch.sum(mask[:, 0, :], 1)
998
+ valid_ratio_height = valid_height.to(dtype) / height
999
+ valid_ratio_width = valid_width.to(dtype) / width
1000
+ valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1)
1001
+ return valid_ratio
1002
+
1003
+ def get_proposal_pos_embed(self, proposals):
1004
+ """Get the position embedding of the proposals."""
1005
+
1006
+ num_pos_feats = self.config.d_model // 2
1007
+ temperature = 10000
1008
+ scale = 2 * math.pi
1009
+
1010
+ # Compute position embeddings in float32 to avoid overflow with large temperature values in fp16
1011
+ proposals_dtype = proposals.dtype
1012
+ dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
1013
+ dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
1014
+ # batch_size, num_queries, 4
1015
+ proposals = proposals.sigmoid().to(torch.float32) * scale
1016
+ # batch_size, num_queries, 4, 128
1017
+ pos = proposals[:, :, :, None] / dim_t
1018
+ # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512
1019
+ pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
1020
+ # Convert back to target dtype after all computations are done
1021
+ return pos.to(proposals_dtype)
1022
+
1023
+ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
1024
+ """Generate the encoder output proposals from encoded enc_output.
1025
+
1026
+ Args:
1027
+ enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder.
1028
+ padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`.
1029
+ spatial_shapes (list[tuple[int, int]]): Spatial shapes of the feature maps.
1030
+
1031
+ Returns:
1032
+ `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
1033
+ - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to
1034
+ directly predict a bounding box. (without the need of a decoder)
1035
+ - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse
1036
+ sigmoid.
1037
+ """
1038
+ batch_size = enc_output.shape[0]
1039
+ proposals = []
1040
+ _cur = 0
1041
+ for level, (height, width) in enumerate(spatial_shapes):
1042
+ mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1)
1043
+ valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
1044
+ valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
1045
+
1046
+ grid_y, grid_x = meshgrid(
1047
+ torch.linspace(
1048
+ 0,
1049
+ height - 1,
1050
+ height,
1051
+ dtype=enc_output.dtype,
1052
+ device=enc_output.device,
1053
+ ),
1054
+ torch.linspace(
1055
+ 0,
1056
+ width - 1,
1057
+ width,
1058
+ dtype=enc_output.dtype,
1059
+ device=enc_output.device,
1060
+ ),
1061
+ indexing="ij",
1062
+ )
1063
+ grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
1064
+
1065
+ scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2)
1066
+ grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale
1067
+ width_height = torch.ones_like(grid) * 0.05 * (2.0**level)
1068
+ proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4)
1069
+ proposals.append(proposal)
1070
+ _cur += height * width
1071
+ output_proposals = torch.cat(proposals, 1)
1072
+ output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
1073
+ output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid
1074
+ output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf"))
1075
+ output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf"))
1076
+
1077
+ # assign each pixel as an object query
1078
+ object_query = enc_output
1079
+ object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0))
1080
+ object_query = object_query.masked_fill(~output_proposals_valid, float(0))
1081
+ object_query = self.enc_output_norm(self.enc_output(object_query))
1082
+ return object_query, output_proposals
1083
+
1084
+ @auto_docstring
1085
+ @can_return_tuple
1086
+ def forward(
1087
+ self,
1088
+ pixel_values: torch.FloatTensor,
1089
+ pixel_mask: torch.LongTensor | None = None,
1090
+ decoder_attention_mask: torch.FloatTensor | None = None,
1091
+ encoder_outputs: torch.FloatTensor | None = None,
1092
+ inputs_embeds: torch.FloatTensor | None = None,
1093
+ decoder_inputs_embeds: torch.FloatTensor | None = None,
1094
+ **kwargs: Unpack[TransformersKwargs],
1095
+ ) -> tuple[torch.FloatTensor] | DeformableDetrModelOutput:
1096
+ r"""
1097
+ decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
1098
+ Not used by default. Can be used to mask object queries.
1099
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1100
+ Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
1101
+ can choose to directly pass a flattened representation of an image.
1102
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
1103
+ Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
1104
+ embedded representation.
1105
+
1106
+ Examples:
1107
+
1108
+ ```python
1109
+ >>> from transformers import AutoImageProcessor, DeformableDetrModel
1110
+ >>> from PIL import Image
1111
+ >>> import requests
1112
+
1113
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1114
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1115
+
1116
+ >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr")
1117
+ >>> model = DeformableDetrModel.from_pretrained("SenseTime/deformable-detr")
1118
+
1119
+ >>> inputs = image_processor(images=image, return_tensors="pt")
1120
+
1121
+ >>> outputs = model(**inputs)
1122
+
1123
+ >>> last_hidden_states = outputs.last_hidden_state
1124
+ >>> list(last_hidden_states.shape)
1125
+ [1, 300, 256]
1126
+ ```"""
1127
+ batch_size, num_channels, height, width = pixel_values.shape
1128
+ device = pixel_values.device
1129
+
1130
+ if pixel_mask is None:
1131
+ pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device)
1132
+
1133
+ # Extract multi-scale feature maps, later projected to the same `config.d_model` dimension (cf. Figure 4 in the paper)
1134
+ # First, send pixel_values + pixel_mask through the backbone to obtain the features
1135
+ # which are returned as a list of (feature map, mask) tuples
1136
+ features = self.backbone(pixel_values, pixel_mask)
1137
+
1138
+ # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
1139
+ sources = []
1140
+ masks = []
1141
+ position_embeddings_list = []
1142
+ for level, (source, mask) in enumerate(features):
1143
+ sources.append(self.input_proj[level](source))
1144
+ masks.append(mask)
1145
+ if mask is None:
1146
+ raise ValueError("No attention mask was provided")
1147
+ # Generate position embeddings for this feature level
1148
+ pos = self.position_embedding(shape=source.shape, device=device, dtype=pixel_values.dtype, mask=mask).to(
1149
+ source.dtype
1150
+ )
1151
+ position_embeddings_list.append(pos)
1152
+
1153
+ # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage
1154
+ if self.config.num_feature_levels > len(sources):
1155
+ _len_sources = len(sources)
1156
+ for level in range(_len_sources, self.config.num_feature_levels):
1157
+ if level == _len_sources:
1158
+ source = self.input_proj[level](features[-1][0])
1159
+ else:
1160
+ source = self.input_proj[level](sources[-1])
1161
+ mask = nn.functional.interpolate(pixel_mask[None].to(pixel_values.dtype), size=source.shape[-2:]).to(
1162
+ torch.bool
1163
+ )[0]
1164
+ pos_l = self.position_embedding(
1165
+ shape=source.shape, device=device, dtype=pixel_values.dtype, mask=mask
1166
+ ).to(source.dtype)
1167
+ sources.append(source)
1168
+ masks.append(mask)
1169
+ position_embeddings_list.append(pos_l)
1170
+
1171
+ # Create queries
1172
+ query_embeds = None
1173
+ if not self.config.two_stage:
1174
+ query_embeds = self.query_position_embeddings.weight
1175
+
1176
+ # Prepare encoder inputs (by flattening)
1177
+ source_flatten = []
1178
+ mask_flatten = []
1179
+ lvl_pos_embed_flatten = []
1180
+ spatial_shapes_list = []
1181
+ for level, (source, mask, pos_embed) in enumerate(zip(sources, masks, position_embeddings_list)):
1182
+ batch_size, num_channels, height, width = source.shape
1183
+ spatial_shape = (height, width)
1184
+ spatial_shapes_list.append(spatial_shape)
1185
+ source = source.flatten(2).transpose(1, 2)
1186
+ mask = mask.flatten(1)
1187
+ lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1)
1188
+ lvl_pos_embed_flatten.append(lvl_pos_embed)
1189
+ source_flatten.append(source)
1190
+ mask_flatten.append(mask)
1191
+ source_flatten = torch.cat(source_flatten, 1)
1192
+ mask_flatten = torch.cat(mask_flatten, 1)
1193
+ lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
1194
+ spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device)
1195
+ level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
1196
+ valid_ratios = torch.stack([self.get_valid_ratio(m, dtype=source_flatten.dtype) for m in masks], 1)
1197
+
1198
+ # Fourth, send source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through the encoder
1199
+ # Also provide spatial_shapes, level_start_index and valid_ratios
1200
+ if encoder_outputs is None:
1201
+ encoder_outputs = self.encoder(
1202
+ inputs_embeds=source_flatten,
1203
+ attention_mask=mask_flatten,
1204
+ spatial_position_embeddings=lvl_pos_embed_flatten,
1205
+ spatial_shapes=spatial_shapes,
1206
+ spatial_shapes_list=spatial_shapes_list,
1207
+ level_start_index=level_start_index,
1208
+ valid_ratios=valid_ratios,
1209
+ **kwargs,
1210
+ )
1211
+
1212
+ # Fifth, prepare decoder inputs
1213
+ batch_size, _, num_channels = encoder_outputs[0].shape
1214
+ enc_outputs_class = None
1215
+ enc_outputs_coord_logits = None
1216
+ if self.config.two_stage:
1217
+ object_query_embedding, output_proposals = self.gen_encoder_output_proposals(
1218
+ encoder_outputs[0], ~mask_flatten, spatial_shapes_list
1219
+ )
1220
+
1221
+ # hack implementation for two-stage Deformable DETR
1222
+ # apply a detection head to each pixel (A.4 in paper)
1223
+ # linear projection for bounding box binary classification (i.e. foreground and background)
1224
+ enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding)
1225
+ # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch)
1226
+ delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding)
1227
+ enc_outputs_coord_logits = delta_bbox + output_proposals
1228
+
1229
+ # only keep top scoring `config.two_stage_num_proposals` proposals
1230
+ topk = self.config.two_stage_num_proposals
1231
+ topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1]
1232
+ topk_coords_logits = torch.gather(
1233
+ enc_outputs_coord_logits,
1234
+ 1,
1235
+ topk_proposals.unsqueeze(-1).repeat(1, 1, 4),
1236
+ )
1237
+
1238
+ topk_coords_logits = topk_coords_logits.detach()
1239
+ reference_points = topk_coords_logits.sigmoid()
1240
+ init_reference_points = reference_points
1241
+ pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits)))
1242
+ query_embed, target = torch.split(pos_trans_out, num_channels, dim=2)
1243
+ else:
1244
+ query_embed, target = torch.split(query_embeds, num_channels, dim=1)
1245
+ query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1)
1246
+ target = target.unsqueeze(0).expand(batch_size, -1, -1)
1247
+ reference_points = self.reference_points(query_embed).sigmoid()
1248
+ init_reference_points = reference_points
1249
+
1250
+ decoder_outputs = self.decoder(
1251
+ inputs_embeds=target,
1252
+ object_queries_position_embeddings=query_embed,
1253
+ encoder_hidden_states=encoder_outputs[0],
1254
+ encoder_attention_mask=mask_flatten,
1255
+ reference_points=reference_points,
1256
+ spatial_shapes=spatial_shapes,
1257
+ spatial_shapes_list=spatial_shapes_list,
1258
+ level_start_index=level_start_index,
1259
+ valid_ratios=valid_ratios,
1260
+ **kwargs,
1261
+ )
1262
+
1263
+ return DeformableDetrModelOutput(
1264
+ init_reference_points=init_reference_points,
1265
+ last_hidden_state=decoder_outputs.last_hidden_state,
1266
+ intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
1267
+ intermediate_reference_points=decoder_outputs.intermediate_reference_points,
1268
+ decoder_hidden_states=decoder_outputs.hidden_states,
1269
+ decoder_attentions=decoder_outputs.attentions,
1270
+ cross_attentions=decoder_outputs.cross_attentions,
1271
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
1272
+ encoder_hidden_states=encoder_outputs.hidden_states,
1273
+ encoder_attentions=encoder_outputs.attentions,
1274
+ enc_outputs_class=enc_outputs_class,
1275
+ enc_outputs_coord_logits=enc_outputs_coord_logits,
1276
+ )
1277
+
1278
+
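Before the encoder call above, the model flattens every feature level to `(batch, H*W, d_model)`, concatenates the levels along the sequence dimension, and records `spatial_shapes` and `level_start_index` so deformable attention can map a flat token index back to its (level, y, x) position. Here is a small standalone sketch of that bookkeeping on toy shapes (padding masks and `valid_ratios` are omitted); it is illustrative only, not the model's own helper.

```python
import torch

# Toy multi-scale feature maps: (batch, channels, H, W) per level
feature_maps = [torch.randn(2, 256, 8, 8), torch.randn(2, 256, 4, 4), torch.randn(2, 256, 2, 2)]

flattened = [f.flatten(2).transpose(1, 2) for f in feature_maps]     # each (batch, H*W, channels)
source_flatten = torch.cat(flattened, dim=1)                         # (batch, sum(H*W), channels)

spatial_shapes = torch.tensor([tuple(f.shape[-2:]) for f in feature_maps])  # [[8, 8], [4, 4], [2, 2]]
# Offset of each level's first token in the flattened sequence
level_start_index = torch.cat(
    (spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])
)

print(source_flatten.shape)   # torch.Size([2, 84, 256])  (64 + 16 + 4 tokens)
print(level_start_index)      # tensor([ 0, 64, 80])
```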
1279
+ class DeformableDetrMLPPredictionHead(DetrMLPPredictionHead):
1280
+ pass
1281
+
1282
+
1283
+ @auto_docstring(
1284
+ custom_intro="""
1285
+ Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on
1286
+ top, for tasks such as COCO detection.
1287
+ """
1288
+ )
1289
+ class DeformableDetrForObjectDetection(DeformableDetrPreTrainedModel):
1290
+ # When using clones, all layers > 0 will be clones, but layer 0 *is* required
1291
+ # We can't initialize the model on meta device as some weights are modified during the initialization
1292
+ _no_split_modules = None
1293
+ _tied_weights_keys = {
1294
+ r"bbox_embed.(?![0])\d+": "bbox_embed.0",
1295
+ r"class_embed.(?![0])\d+": "class_embed.0",
1296
+ }
1297
+
1298
+ def __init__(self, config: DeformableDetrConfig):
1299
+ super().__init__(config)
1300
+ # Deformable DETR encoder-decoder model
1301
+ self.model = DeformableDetrModel(config)
1302
+ # Detection heads on top
1303
+ # if two-stage, the last class_embed and bbox_embed are for region proposal generation
1304
+ num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers
1305
+ self.class_embed = nn.ModuleList([nn.Linear(config.d_model, config.num_labels) for _ in range(num_pred)])
1306
+ self.bbox_embed = nn.ModuleList(
1307
+ [
1308
+ DeformableDetrMLPPredictionHead(
1309
+ input_dim=config.d_model,
1310
+ hidden_dim=config.d_model,
1311
+ output_dim=4,
1312
+ num_layers=3,
1313
+ )
1314
+ for _ in range(num_pred)
1315
+ ]
1316
+ )
1317
+ # Convert to instance attribute before modifying
1318
+ self._tied_weights_keys = self._tied_weights_keys.copy()
1319
+ if config.with_box_refine:
1320
+ self.model.decoder.bbox_embed = self.bbox_embed
1321
+ self._tied_weights_keys["bbox_embed"] = "model.decoder.bbox_embed"
1322
+ if config.two_stage:
1323
+ self.model.decoder.class_embed = self.class_embed
1324
+ self._tied_weights_keys["class_embed"] = "model.decoder.class_embed"
1325
+ self.post_init()
1326
+
1327
+ @auto_docstring
1328
+ @can_return_tuple
1329
+ def forward(
1330
+ self,
1331
+ pixel_values: torch.FloatTensor,
1332
+ pixel_mask: torch.LongTensor | None = None,
1333
+ decoder_attention_mask: torch.FloatTensor | None = None,
1334
+ encoder_outputs: torch.FloatTensor | None = None,
1335
+ inputs_embeds: torch.FloatTensor | None = None,
1336
+ decoder_inputs_embeds: torch.FloatTensor | None = None,
1337
+ labels: list[dict] | None = None,
1338
+ **kwargs: Unpack[TransformersKwargs],
1339
+ ) -> tuple[torch.FloatTensor] | DeformableDetrObjectDetectionOutput:
1340
+ r"""
1341
+ decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
1342
+ Not used by default. Can be used to mask object queries.
1343
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1344
+ Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
1345
+ can choose to directly pass a flattened representation of an image.
1346
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
1347
+ Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
1348
+ embedded representation.
1349
+ labels (`list[Dict]` of len `(batch_size,)`, *optional*):
1350
+ Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
1351
+ following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
1352
+ respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
1353
+ in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
1354
+
1355
+ Examples:
1356
+
1357
+ ```python
1358
+ >>> from transformers import AutoImageProcessor, DeformableDetrForObjectDetection
1359
+ >>> from PIL import Image
1360
+ >>> import requests
+ >>> import torch
1361
+
1362
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1363
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1364
+
1365
+ >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr")
1366
+ >>> model = DeformableDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr")
1367
+
1368
+ >>> inputs = image_processor(images=image, return_tensors="pt")
1369
+ >>> outputs = model(**inputs)
1370
+
1371
+ >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
1372
+ >>> target_sizes = torch.tensor([image.size[::-1]])
1373
+ >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[
1374
+ ... 0
1375
+ ... ]
1376
+ >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
1377
+ ... box = [round(i, 2) for i in box.tolist()]
1378
+ ... print(
1379
+ ... f"Detected {model.config.id2label[label.item()]} with confidence "
1380
+ ... f"{round(score.item(), 3)} at location {box}"
1381
+ ... )
1382
+ Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78]
1383
+ Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25]
1384
+ Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25]
1385
+ ```"""
1386
+ # First, send images through the DETR base model to obtain encoder + decoder outputs
1387
+ outputs = self.model(
1388
+ pixel_values,
1389
+ pixel_mask=pixel_mask,
1390
+ decoder_attention_mask=decoder_attention_mask,
1391
+ encoder_outputs=encoder_outputs,
1392
+ inputs_embeds=inputs_embeds,
1393
+ decoder_inputs_embeds=decoder_inputs_embeds,
1394
+ **kwargs,
1395
+ )
1396
+
1397
+ hidden_states = outputs.intermediate_hidden_states
1398
+ init_reference = outputs.init_reference_points
1399
+ inter_references = outputs.intermediate_reference_points
1400
+
1401
+ # class logits + predicted bounding boxes
1402
+ outputs_classes = []
1403
+ outputs_coords = []
1404
+
1405
+ for level in range(hidden_states.shape[1]):
1406
+ if level == 0:
1407
+ reference = init_reference
1408
+ else:
1409
+ reference = inter_references[:, level - 1]
1410
+ reference = inverse_sigmoid(reference)
1411
+ outputs_class = self.class_embed[level](hidden_states[:, level])
1412
+ delta_bbox = self.bbox_embed[level](hidden_states[:, level])
1413
+ if reference.shape[-1] == 4:
1414
+ outputs_coord_logits = delta_bbox + reference
1415
+ elif reference.shape[-1] == 2:
1416
+ delta_bbox[..., :2] += reference
1417
+ outputs_coord_logits = delta_bbox
1418
+ else:
1419
+ raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}")
1420
+ outputs_coord = outputs_coord_logits.sigmoid()
1421
+ outputs_classes.append(outputs_class)
1422
+ outputs_coords.append(outputs_coord)
1423
+ outputs_class = torch.stack(outputs_classes)
1424
+ outputs_coord = torch.stack(outputs_coords)
1425
+
1426
+ logits = outputs_class[-1]
1427
+ pred_boxes = outputs_coord[-1]
1428
+
1429
+ loss, loss_dict, auxiliary_outputs = None, None, None
1430
+ if labels is not None:
1431
+ loss, loss_dict, auxiliary_outputs = self.loss_function(
1432
+ logits,
1433
+ labels,
1434
+ self.device,
1435
+ pred_boxes,
1436
+ self.config,
1437
+ outputs_class,
1438
+ outputs_coord,
1439
+ )
1440
+
1441
+ return DeformableDetrObjectDetectionOutput(
1442
+ loss=loss,
1443
+ loss_dict=loss_dict,
1444
+ logits=logits,
1445
+ pred_boxes=pred_boxes,
1446
+ auxiliary_outputs=auxiliary_outputs,
1447
+ last_hidden_state=outputs.last_hidden_state,
1448
+ decoder_hidden_states=outputs.decoder_hidden_states,
1449
+ decoder_attentions=outputs.decoder_attentions,
1450
+ cross_attentions=outputs.cross_attentions,
1451
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1452
+ encoder_hidden_states=outputs.encoder_hidden_states,
1453
+ encoder_attentions=outputs.encoder_attentions,
1454
+ intermediate_hidden_states=outputs.intermediate_hidden_states,
1455
+ intermediate_reference_points=outputs.intermediate_reference_points,
1456
+ init_reference_points=outputs.init_reference_points,
1457
+ enc_outputs_class=outputs.enc_outputs_class,
1458
+ enc_outputs_coord_logits=outputs.enc_outputs_coord_logits,
1459
+ )
1460
+
1461
+
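The per-layer prediction loop above adds a box delta to the inverse-sigmoid of the previous reference points: 2-dim references (plain object queries) only shift the box center, while 4-dim references (the two-stage / box-refinement path) update all four coordinates before the final sigmoid. A standalone illustration of that update rule follows; `refine_boxes` and the inline `inverse_sigmoid` are simplified stand-ins written for this sketch, not the functions used in the file.

```python
import torch

def inverse_sigmoid(x, eps=1e-5):
    # Simplified for the sketch: clamp away from 0/1, then logit
    x = x.clamp(min=eps, max=1 - eps)
    return torch.log(x / (1 - x))

def refine_boxes(reference, delta_bbox):
    # reference: normalized (cx, cy) or (cx, cy, w, h); delta_bbox: raw bbox-head output (..., 4)
    reference = inverse_sigmoid(reference)
    if reference.shape[-1] == 4:
        coord_logits = delta_bbox + reference
    elif reference.shape[-1] == 2:
        coord_logits = delta_bbox.clone()
        coord_logits[..., :2] += reference
    else:
        raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}")
    return coord_logits.sigmoid()

reference = torch.tensor([[0.5, 0.5]])   # center-only reference point
delta = torch.zeros(1, 4)                # no refinement from the bbox head
print(refine_boxes(reference, delta))    # tensor([[0.5000, 0.5000, 0.5000, 0.5000]])
```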
1462
+ __all__ = [
1463
+ "DeformableDetrImageProcessorFast",
1464
+ "DeformableDetrForObjectDetection",
1465
+ "DeformableDetrModel",
1466
+ "DeformableDetrPreTrainedModel",
1467
+ ]