transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (1021)
  1. transformers/__init__.py +4 -11
  2. transformers/activations.py +2 -2
  3. transformers/backbone_utils.py +326 -0
  4. transformers/cache_utils.py +11 -2
  5. transformers/cli/serve.py +11 -8
  6. transformers/configuration_utils.py +1 -69
  7. transformers/conversion_mapping.py +146 -26
  8. transformers/convert_slow_tokenizer.py +6 -4
  9. transformers/core_model_loading.py +207 -118
  10. transformers/dependency_versions_check.py +0 -1
  11. transformers/dependency_versions_table.py +7 -8
  12. transformers/file_utils.py +0 -2
  13. transformers/generation/candidate_generator.py +1 -2
  14. transformers/generation/continuous_batching/cache.py +40 -38
  15. transformers/generation/continuous_batching/cache_manager.py +3 -16
  16. transformers/generation/continuous_batching/continuous_api.py +94 -406
  17. transformers/generation/continuous_batching/input_ouputs.py +464 -0
  18. transformers/generation/continuous_batching/requests.py +54 -17
  19. transformers/generation/continuous_batching/scheduler.py +77 -95
  20. transformers/generation/logits_process.py +10 -5
  21. transformers/generation/stopping_criteria.py +1 -2
  22. transformers/generation/utils.py +75 -95
  23. transformers/image_processing_utils.py +0 -3
  24. transformers/image_processing_utils_fast.py +17 -18
  25. transformers/image_transforms.py +44 -13
  26. transformers/image_utils.py +0 -5
  27. transformers/initialization.py +57 -0
  28. transformers/integrations/__init__.py +10 -24
  29. transformers/integrations/accelerate.py +47 -11
  30. transformers/integrations/deepspeed.py +145 -3
  31. transformers/integrations/executorch.py +2 -6
  32. transformers/integrations/finegrained_fp8.py +142 -7
  33. transformers/integrations/flash_attention.py +2 -7
  34. transformers/integrations/hub_kernels.py +18 -7
  35. transformers/integrations/moe.py +226 -106
  36. transformers/integrations/mxfp4.py +47 -34
  37. transformers/integrations/peft.py +488 -176
  38. transformers/integrations/tensor_parallel.py +641 -581
  39. transformers/masking_utils.py +153 -9
  40. transformers/modeling_flash_attention_utils.py +1 -2
  41. transformers/modeling_utils.py +359 -358
  42. transformers/models/__init__.py +6 -0
  43. transformers/models/afmoe/configuration_afmoe.py +14 -4
  44. transformers/models/afmoe/modeling_afmoe.py +8 -8
  45. transformers/models/afmoe/modular_afmoe.py +7 -7
  46. transformers/models/aimv2/configuration_aimv2.py +2 -7
  47. transformers/models/aimv2/modeling_aimv2.py +26 -24
  48. transformers/models/aimv2/modular_aimv2.py +8 -12
  49. transformers/models/albert/configuration_albert.py +8 -1
  50. transformers/models/albert/modeling_albert.py +3 -3
  51. transformers/models/align/configuration_align.py +8 -5
  52. transformers/models/align/modeling_align.py +22 -24
  53. transformers/models/altclip/configuration_altclip.py +4 -6
  54. transformers/models/altclip/modeling_altclip.py +30 -26
  55. transformers/models/apertus/configuration_apertus.py +5 -7
  56. transformers/models/apertus/modeling_apertus.py +4 -4
  57. transformers/models/apertus/modular_apertus.py +8 -10
  58. transformers/models/arcee/configuration_arcee.py +5 -7
  59. transformers/models/arcee/modeling_arcee.py +4 -4
  60. transformers/models/aria/configuration_aria.py +11 -21
  61. transformers/models/aria/modeling_aria.py +39 -36
  62. transformers/models/aria/modular_aria.py +33 -39
  63. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
  64. transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
  65. transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
  66. transformers/models/auto/auto_factory.py +8 -6
  67. transformers/models/auto/configuration_auto.py +22 -0
  68. transformers/models/auto/image_processing_auto.py +17 -13
  69. transformers/models/auto/modeling_auto.py +15 -0
  70. transformers/models/auto/processing_auto.py +9 -18
  71. transformers/models/auto/tokenization_auto.py +17 -15
  72. transformers/models/autoformer/modeling_autoformer.py +2 -1
  73. transformers/models/aya_vision/configuration_aya_vision.py +4 -0
  74. transformers/models/aya_vision/modeling_aya_vision.py +29 -62
  75. transformers/models/aya_vision/modular_aya_vision.py +20 -45
  76. transformers/models/bamba/configuration_bamba.py +17 -7
  77. transformers/models/bamba/modeling_bamba.py +23 -55
  78. transformers/models/bamba/modular_bamba.py +19 -54
  79. transformers/models/bark/configuration_bark.py +2 -1
  80. transformers/models/bark/modeling_bark.py +24 -10
  81. transformers/models/bart/configuration_bart.py +9 -4
  82. transformers/models/bart/modeling_bart.py +9 -12
  83. transformers/models/beit/configuration_beit.py +2 -4
  84. transformers/models/beit/image_processing_beit_fast.py +3 -3
  85. transformers/models/beit/modeling_beit.py +14 -9
  86. transformers/models/bert/configuration_bert.py +12 -1
  87. transformers/models/bert/modeling_bert.py +6 -30
  88. transformers/models/bert_generation/configuration_bert_generation.py +17 -1
  89. transformers/models/bert_generation/modeling_bert_generation.py +6 -6
  90. transformers/models/big_bird/configuration_big_bird.py +12 -8
  91. transformers/models/big_bird/modeling_big_bird.py +0 -15
  92. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
  93. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
  94. transformers/models/biogpt/configuration_biogpt.py +8 -1
  95. transformers/models/biogpt/modeling_biogpt.py +4 -8
  96. transformers/models/biogpt/modular_biogpt.py +1 -5
  97. transformers/models/bit/configuration_bit.py +2 -4
  98. transformers/models/bit/modeling_bit.py +6 -5
  99. transformers/models/bitnet/configuration_bitnet.py +5 -7
  100. transformers/models/bitnet/modeling_bitnet.py +3 -4
  101. transformers/models/bitnet/modular_bitnet.py +3 -4
  102. transformers/models/blenderbot/configuration_blenderbot.py +8 -4
  103. transformers/models/blenderbot/modeling_blenderbot.py +4 -4
  104. transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
  105. transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
  106. transformers/models/blip/configuration_blip.py +9 -9
  107. transformers/models/blip/modeling_blip.py +55 -37
  108. transformers/models/blip_2/configuration_blip_2.py +2 -1
  109. transformers/models/blip_2/modeling_blip_2.py +81 -56
  110. transformers/models/bloom/configuration_bloom.py +5 -1
  111. transformers/models/bloom/modeling_bloom.py +2 -1
  112. transformers/models/blt/configuration_blt.py +23 -12
  113. transformers/models/blt/modeling_blt.py +20 -14
  114. transformers/models/blt/modular_blt.py +70 -10
  115. transformers/models/bridgetower/configuration_bridgetower.py +7 -1
  116. transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
  117. transformers/models/bridgetower/modeling_bridgetower.py +29 -15
  118. transformers/models/bros/configuration_bros.py +24 -17
  119. transformers/models/camembert/configuration_camembert.py +8 -1
  120. transformers/models/camembert/modeling_camembert.py +6 -6
  121. transformers/models/canine/configuration_canine.py +4 -1
  122. transformers/models/chameleon/configuration_chameleon.py +5 -7
  123. transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
  124. transformers/models/chameleon/modeling_chameleon.py +82 -36
  125. transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
  126. transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
  127. transformers/models/clap/configuration_clap.py +4 -8
  128. transformers/models/clap/modeling_clap.py +21 -22
  129. transformers/models/clip/configuration_clip.py +4 -1
  130. transformers/models/clip/image_processing_clip_fast.py +9 -0
  131. transformers/models/clip/modeling_clip.py +25 -22
  132. transformers/models/clipseg/configuration_clipseg.py +4 -1
  133. transformers/models/clipseg/modeling_clipseg.py +27 -25
  134. transformers/models/clipseg/processing_clipseg.py +11 -3
  135. transformers/models/clvp/configuration_clvp.py +14 -2
  136. transformers/models/clvp/modeling_clvp.py +19 -30
  137. transformers/models/codegen/configuration_codegen.py +4 -3
  138. transformers/models/codegen/modeling_codegen.py +2 -1
  139. transformers/models/cohere/configuration_cohere.py +5 -7
  140. transformers/models/cohere/modeling_cohere.py +4 -4
  141. transformers/models/cohere/modular_cohere.py +3 -3
  142. transformers/models/cohere2/configuration_cohere2.py +6 -8
  143. transformers/models/cohere2/modeling_cohere2.py +4 -4
  144. transformers/models/cohere2/modular_cohere2.py +9 -11
  145. transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
  146. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
  147. transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
  148. transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
  149. transformers/models/colqwen2/modeling_colqwen2.py +7 -6
  150. transformers/models/colqwen2/modular_colqwen2.py +7 -6
  151. transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
  152. transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
  153. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
  154. transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
  155. transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
  156. transformers/models/convbert/configuration_convbert.py +11 -7
  157. transformers/models/convnext/configuration_convnext.py +2 -4
  158. transformers/models/convnext/image_processing_convnext_fast.py +2 -2
  159. transformers/models/convnext/modeling_convnext.py +7 -6
  160. transformers/models/convnextv2/configuration_convnextv2.py +2 -4
  161. transformers/models/convnextv2/modeling_convnextv2.py +7 -6
  162. transformers/models/cpmant/configuration_cpmant.py +4 -0
  163. transformers/models/csm/configuration_csm.py +9 -15
  164. transformers/models/csm/modeling_csm.py +3 -3
  165. transformers/models/ctrl/configuration_ctrl.py +16 -0
  166. transformers/models/ctrl/modeling_ctrl.py +13 -25
  167. transformers/models/cwm/configuration_cwm.py +5 -7
  168. transformers/models/cwm/modeling_cwm.py +4 -4
  169. transformers/models/d_fine/configuration_d_fine.py +10 -56
  170. transformers/models/d_fine/modeling_d_fine.py +728 -868
  171. transformers/models/d_fine/modular_d_fine.py +335 -412
  172. transformers/models/dab_detr/configuration_dab_detr.py +22 -48
  173. transformers/models/dab_detr/modeling_dab_detr.py +11 -7
  174. transformers/models/dac/modeling_dac.py +1 -1
  175. transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
  176. transformers/models/data2vec/configuration_data2vec_text.py +11 -2
  177. transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
  178. transformers/models/data2vec/modeling_data2vec_text.py +6 -6
  179. transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
  180. transformers/models/dbrx/configuration_dbrx.py +11 -3
  181. transformers/models/dbrx/modeling_dbrx.py +6 -6
  182. transformers/models/dbrx/modular_dbrx.py +6 -6
  183. transformers/models/deberta/configuration_deberta.py +6 -0
  184. transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
  185. transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
  186. transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
  187. transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
  188. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
  189. transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
  190. transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
  191. transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
  192. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
  193. transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
  194. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
  195. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
  196. transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
  197. transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
  198. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
  199. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
  200. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
  201. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
  202. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
  203. transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
  204. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
  205. transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
  206. transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
  207. transformers/models/deit/modeling_deit.py +11 -7
  208. transformers/models/depth_anything/configuration_depth_anything.py +12 -42
  209. transformers/models/depth_anything/modeling_depth_anything.py +5 -3
  210. transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
  211. transformers/models/depth_pro/modeling_depth_pro.py +8 -4
  212. transformers/models/detr/configuration_detr.py +18 -49
  213. transformers/models/detr/image_processing_detr_fast.py +11 -11
  214. transformers/models/detr/modeling_detr.py +695 -734
  215. transformers/models/dia/configuration_dia.py +4 -7
  216. transformers/models/dia/generation_dia.py +8 -17
  217. transformers/models/dia/modeling_dia.py +7 -7
  218. transformers/models/dia/modular_dia.py +4 -4
  219. transformers/models/diffllama/configuration_diffllama.py +5 -7
  220. transformers/models/diffllama/modeling_diffllama.py +3 -8
  221. transformers/models/diffllama/modular_diffllama.py +2 -7
  222. transformers/models/dinat/configuration_dinat.py +2 -4
  223. transformers/models/dinat/modeling_dinat.py +7 -6
  224. transformers/models/dinov2/configuration_dinov2.py +2 -4
  225. transformers/models/dinov2/modeling_dinov2.py +9 -8
  226. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
  227. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
  228. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
  229. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
  230. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
  231. transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
  232. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
  233. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
  234. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
  235. transformers/models/distilbert/configuration_distilbert.py +8 -1
  236. transformers/models/distilbert/modeling_distilbert.py +3 -3
  237. transformers/models/doge/configuration_doge.py +17 -7
  238. transformers/models/doge/modeling_doge.py +4 -4
  239. transformers/models/doge/modular_doge.py +20 -10
  240. transformers/models/donut/image_processing_donut_fast.py +4 -4
  241. transformers/models/dots1/configuration_dots1.py +16 -7
  242. transformers/models/dots1/modeling_dots1.py +4 -4
  243. transformers/models/dpr/configuration_dpr.py +19 -1
  244. transformers/models/dpt/configuration_dpt.py +23 -65
  245. transformers/models/dpt/image_processing_dpt_fast.py +5 -5
  246. transformers/models/dpt/modeling_dpt.py +19 -15
  247. transformers/models/dpt/modular_dpt.py +4 -4
  248. transformers/models/edgetam/configuration_edgetam.py +1 -1
  249. transformers/models/edgetam/modeling_edgetam.py +53 -53
  250. transformers/models/edgetam/modular_edgetam.py +5 -7
  251. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
  252. transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
  253. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
  254. transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
  255. transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
  256. transformers/models/electra/configuration_electra.py +13 -2
  257. transformers/models/electra/modeling_electra.py +6 -6
  258. transformers/models/emu3/configuration_emu3.py +12 -10
  259. transformers/models/emu3/modeling_emu3.py +84 -47
  260. transformers/models/emu3/modular_emu3.py +77 -39
  261. transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
  262. transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
  263. transformers/models/eomt/configuration_eomt.py +12 -13
  264. transformers/models/eomt/image_processing_eomt_fast.py +3 -3
  265. transformers/models/eomt/modeling_eomt.py +3 -3
  266. transformers/models/eomt/modular_eomt.py +17 -17
  267. transformers/models/eomt_dinov3/__init__.py +28 -0
  268. transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
  269. transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
  270. transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
  271. transformers/models/ernie/configuration_ernie.py +24 -2
  272. transformers/models/ernie/modeling_ernie.py +6 -30
  273. transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
  274. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  275. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
  276. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
  277. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
  278. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
  279. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
  280. transformers/models/esm/configuration_esm.py +9 -11
  281. transformers/models/esm/modeling_esm.py +3 -3
  282. transformers/models/esm/modeling_esmfold.py +1 -6
  283. transformers/models/esm/openfold_utils/protein.py +2 -3
  284. transformers/models/evolla/configuration_evolla.py +21 -8
  285. transformers/models/evolla/modeling_evolla.py +11 -7
  286. transformers/models/evolla/modular_evolla.py +5 -1
  287. transformers/models/exaone4/configuration_exaone4.py +8 -5
  288. transformers/models/exaone4/modeling_exaone4.py +4 -4
  289. transformers/models/exaone4/modular_exaone4.py +11 -8
  290. transformers/models/exaone_moe/__init__.py +27 -0
  291. transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
  292. transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
  293. transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
  294. transformers/models/falcon/configuration_falcon.py +9 -1
  295. transformers/models/falcon/modeling_falcon.py +3 -8
  296. transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
  297. transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
  298. transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
  299. transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
  300. transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
  301. transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
  302. transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
  303. transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
  304. transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
  305. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
  306. transformers/models/flaubert/configuration_flaubert.py +10 -4
  307. transformers/models/flaubert/modeling_flaubert.py +1 -1
  308. transformers/models/flava/configuration_flava.py +4 -3
  309. transformers/models/flava/image_processing_flava_fast.py +4 -4
  310. transformers/models/flava/modeling_flava.py +36 -28
  311. transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
  312. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
  313. transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
  314. transformers/models/florence2/configuration_florence2.py +4 -0
  315. transformers/models/florence2/modeling_florence2.py +57 -32
  316. transformers/models/florence2/modular_florence2.py +48 -26
  317. transformers/models/fnet/configuration_fnet.py +6 -1
  318. transformers/models/focalnet/configuration_focalnet.py +2 -4
  319. transformers/models/focalnet/modeling_focalnet.py +10 -7
  320. transformers/models/fsmt/configuration_fsmt.py +12 -16
  321. transformers/models/funnel/configuration_funnel.py +8 -0
  322. transformers/models/fuyu/configuration_fuyu.py +5 -8
  323. transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
  324. transformers/models/fuyu/modeling_fuyu.py +24 -23
  325. transformers/models/gemma/configuration_gemma.py +5 -7
  326. transformers/models/gemma/modeling_gemma.py +4 -4
  327. transformers/models/gemma/modular_gemma.py +5 -7
  328. transformers/models/gemma2/configuration_gemma2.py +5 -7
  329. transformers/models/gemma2/modeling_gemma2.py +4 -4
  330. transformers/models/gemma2/modular_gemma2.py +8 -10
  331. transformers/models/gemma3/configuration_gemma3.py +28 -22
  332. transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
  333. transformers/models/gemma3/modeling_gemma3.py +37 -33
  334. transformers/models/gemma3/modular_gemma3.py +46 -42
  335. transformers/models/gemma3n/configuration_gemma3n.py +35 -22
  336. transformers/models/gemma3n/modeling_gemma3n.py +86 -58
  337. transformers/models/gemma3n/modular_gemma3n.py +112 -75
  338. transformers/models/git/configuration_git.py +5 -7
  339. transformers/models/git/modeling_git.py +31 -41
  340. transformers/models/glm/configuration_glm.py +7 -9
  341. transformers/models/glm/modeling_glm.py +4 -4
  342. transformers/models/glm4/configuration_glm4.py +7 -9
  343. transformers/models/glm4/modeling_glm4.py +4 -4
  344. transformers/models/glm46v/configuration_glm46v.py +4 -0
  345. transformers/models/glm46v/image_processing_glm46v.py +5 -2
  346. transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
  347. transformers/models/glm46v/modeling_glm46v.py +91 -46
  348. transformers/models/glm46v/modular_glm46v.py +4 -0
  349. transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
  350. transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
  351. transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
  352. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
  353. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
  354. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
  355. transformers/models/glm4v/configuration_glm4v.py +12 -8
  356. transformers/models/glm4v/image_processing_glm4v.py +5 -2
  357. transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
  358. transformers/models/glm4v/modeling_glm4v.py +120 -63
  359. transformers/models/glm4v/modular_glm4v.py +82 -50
  360. transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
  361. transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
  362. transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
  363. transformers/models/glm_image/configuration_glm_image.py +26 -20
  364. transformers/models/glm_image/image_processing_glm_image.py +1 -1
  365. transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
  366. transformers/models/glm_image/modeling_glm_image.py +337 -236
  367. transformers/models/glm_image/modular_glm_image.py +415 -255
  368. transformers/models/glm_image/processing_glm_image.py +65 -17
  369. transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
  370. transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
  371. transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
  372. transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
  373. transformers/models/glmasr/modeling_glmasr.py +34 -28
  374. transformers/models/glmasr/modular_glmasr.py +23 -11
  375. transformers/models/glpn/image_processing_glpn_fast.py +3 -3
  376. transformers/models/glpn/modeling_glpn.py +4 -2
  377. transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
  378. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
  379. transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
  380. transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
  381. transformers/models/gpt2/configuration_gpt2.py +13 -1
  382. transformers/models/gpt2/modeling_gpt2.py +5 -5
  383. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
  384. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
  385. transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
  386. transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
  387. transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
  388. transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
  389. transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
  390. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
  391. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
  392. transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
  393. transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
  394. transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
  395. transformers/models/gptj/configuration_gptj.py +4 -4
  396. transformers/models/gptj/modeling_gptj.py +3 -7
  397. transformers/models/granite/configuration_granite.py +5 -7
  398. transformers/models/granite/modeling_granite.py +4 -4
  399. transformers/models/granite_speech/modeling_granite_speech.py +63 -37
  400. transformers/models/granitemoe/configuration_granitemoe.py +5 -7
  401. transformers/models/granitemoe/modeling_granitemoe.py +4 -4
  402. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
  403. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
  404. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
  405. transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
  406. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
  407. transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
  408. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
  409. transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
  410. transformers/models/groupvit/configuration_groupvit.py +4 -1
  411. transformers/models/groupvit/modeling_groupvit.py +29 -22
  412. transformers/models/helium/configuration_helium.py +5 -7
  413. transformers/models/helium/modeling_helium.py +4 -4
  414. transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
  415. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
  416. transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
  417. transformers/models/hiera/configuration_hiera.py +2 -4
  418. transformers/models/hiera/modeling_hiera.py +11 -8
  419. transformers/models/hubert/configuration_hubert.py +4 -1
  420. transformers/models/hubert/modeling_hubert.py +7 -4
  421. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
  422. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
  423. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
  424. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
  425. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
  426. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
  427. transformers/models/ibert/configuration_ibert.py +4 -1
  428. transformers/models/idefics/configuration_idefics.py +5 -7
  429. transformers/models/idefics/modeling_idefics.py +3 -4
  430. transformers/models/idefics/vision.py +5 -4
  431. transformers/models/idefics2/configuration_idefics2.py +1 -2
  432. transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
  433. transformers/models/idefics2/modeling_idefics2.py +72 -50
  434. transformers/models/idefics3/configuration_idefics3.py +1 -3
  435. transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
  436. transformers/models/idefics3/modeling_idefics3.py +63 -40
  437. transformers/models/ijepa/modeling_ijepa.py +3 -3
  438. transformers/models/imagegpt/configuration_imagegpt.py +9 -1
  439. transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
  440. transformers/models/imagegpt/modeling_imagegpt.py +8 -4
  441. transformers/models/informer/modeling_informer.py +3 -3
  442. transformers/models/instructblip/configuration_instructblip.py +2 -1
  443. transformers/models/instructblip/modeling_instructblip.py +65 -39
  444. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
  445. transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
  446. transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
  447. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
  448. transformers/models/internvl/configuration_internvl.py +5 -0
  449. transformers/models/internvl/modeling_internvl.py +35 -55
  450. transformers/models/internvl/modular_internvl.py +26 -38
  451. transformers/models/internvl/video_processing_internvl.py +2 -2
  452. transformers/models/jais2/configuration_jais2.py +5 -7
  453. transformers/models/jais2/modeling_jais2.py +4 -4
  454. transformers/models/jamba/configuration_jamba.py +5 -7
  455. transformers/models/jamba/modeling_jamba.py +4 -4
  456. transformers/models/jamba/modular_jamba.py +3 -3
  457. transformers/models/janus/image_processing_janus.py +2 -2
  458. transformers/models/janus/image_processing_janus_fast.py +8 -8
  459. transformers/models/janus/modeling_janus.py +63 -146
  460. transformers/models/janus/modular_janus.py +62 -20
  461. transformers/models/jetmoe/configuration_jetmoe.py +6 -4
  462. transformers/models/jetmoe/modeling_jetmoe.py +3 -3
  463. transformers/models/jetmoe/modular_jetmoe.py +3 -3
  464. transformers/models/kosmos2/configuration_kosmos2.py +10 -8
  465. transformers/models/kosmos2/modeling_kosmos2.py +56 -34
  466. transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
  467. transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
  468. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
  469. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
  470. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
  471. transformers/models/lasr/configuration_lasr.py +2 -4
  472. transformers/models/lasr/modeling_lasr.py +3 -3
  473. transformers/models/lasr/modular_lasr.py +3 -3
  474. transformers/models/layoutlm/configuration_layoutlm.py +14 -1
  475. transformers/models/layoutlm/modeling_layoutlm.py +3 -3
  476. transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
  477. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
  478. transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
  479. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
  480. transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
  481. transformers/models/led/configuration_led.py +7 -8
  482. transformers/models/levit/image_processing_levit_fast.py +4 -4
  483. transformers/models/lfm2/configuration_lfm2.py +5 -7
  484. transformers/models/lfm2/modeling_lfm2.py +4 -4
  485. transformers/models/lfm2/modular_lfm2.py +3 -3
  486. transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
  487. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
  488. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  489. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
  490. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
  491. transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
  492. transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
  493. transformers/models/lightglue/modeling_lightglue.py +3 -3
  494. transformers/models/lightglue/modular_lightglue.py +3 -3
  495. transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
  496. transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
  497. transformers/models/lilt/configuration_lilt.py +6 -1
  498. transformers/models/llama/configuration_llama.py +5 -7
  499. transformers/models/llama/modeling_llama.py +4 -4
  500. transformers/models/llama4/configuration_llama4.py +67 -47
  501. transformers/models/llama4/image_processing_llama4_fast.py +3 -3
  502. transformers/models/llama4/modeling_llama4.py +46 -44
  503. transformers/models/llava/configuration_llava.py +10 -0
  504. transformers/models/llava/image_processing_llava_fast.py +3 -3
  505. transformers/models/llava/modeling_llava.py +38 -65
  506. transformers/models/llava_next/configuration_llava_next.py +2 -1
  507. transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
  508. transformers/models/llava_next/modeling_llava_next.py +61 -60
  509. transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
  510. transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
  511. transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
  512. transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
  513. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
  514. transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
  515. transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
  516. transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
  517. transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
  518. transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
  519. transformers/models/longformer/configuration_longformer.py +4 -1
  520. transformers/models/longt5/configuration_longt5.py +9 -6
  521. transformers/models/longt5/modeling_longt5.py +2 -1
  522. transformers/models/luke/configuration_luke.py +8 -1
  523. transformers/models/lw_detr/configuration_lw_detr.py +19 -31
  524. transformers/models/lw_detr/modeling_lw_detr.py +43 -44
  525. transformers/models/lw_detr/modular_lw_detr.py +36 -38
  526. transformers/models/lxmert/configuration_lxmert.py +16 -0
  527. transformers/models/m2m_100/configuration_m2m_100.py +7 -8
  528. transformers/models/m2m_100/modeling_m2m_100.py +3 -3
  529. transformers/models/mamba/configuration_mamba.py +5 -2
  530. transformers/models/mamba/modeling_mamba.py +18 -26
  531. transformers/models/mamba2/configuration_mamba2.py +5 -7
  532. transformers/models/mamba2/modeling_mamba2.py +22 -33
  533. transformers/models/marian/configuration_marian.py +10 -4
  534. transformers/models/marian/modeling_marian.py +4 -4
  535. transformers/models/markuplm/configuration_markuplm.py +4 -6
  536. transformers/models/markuplm/modeling_markuplm.py +3 -3
  537. transformers/models/mask2former/configuration_mask2former.py +12 -47
  538. transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
  539. transformers/models/mask2former/modeling_mask2former.py +18 -12
  540. transformers/models/maskformer/configuration_maskformer.py +14 -45
  541. transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
  542. transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
  543. transformers/models/maskformer/modeling_maskformer.py +15 -9
  544. transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
  545. transformers/models/mbart/configuration_mbart.py +9 -4
  546. transformers/models/mbart/modeling_mbart.py +9 -6
  547. transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
  548. transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
  549. transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
  550. transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
  551. transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
  552. transformers/models/mgp_str/modeling_mgp_str.py +4 -2
  553. transformers/models/mimi/configuration_mimi.py +4 -0
  554. transformers/models/mimi/modeling_mimi.py +40 -36
  555. transformers/models/minimax/configuration_minimax.py +8 -11
  556. transformers/models/minimax/modeling_minimax.py +5 -5
  557. transformers/models/minimax/modular_minimax.py +9 -12
  558. transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
  559. transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
  560. transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
  561. transformers/models/ministral/configuration_ministral.py +5 -7
  562. transformers/models/ministral/modeling_ministral.py +4 -4
  563. transformers/models/ministral/modular_ministral.py +5 -8
  564. transformers/models/ministral3/configuration_ministral3.py +4 -4
  565. transformers/models/ministral3/modeling_ministral3.py +4 -4
  566. transformers/models/ministral3/modular_ministral3.py +3 -3
  567. transformers/models/mistral/configuration_mistral.py +5 -7
  568. transformers/models/mistral/modeling_mistral.py +4 -4
  569. transformers/models/mistral/modular_mistral.py +3 -3
  570. transformers/models/mistral3/configuration_mistral3.py +4 -0
  571. transformers/models/mistral3/modeling_mistral3.py +36 -40
  572. transformers/models/mistral3/modular_mistral3.py +31 -32
  573. transformers/models/mixtral/configuration_mixtral.py +8 -11
  574. transformers/models/mixtral/modeling_mixtral.py +4 -4
  575. transformers/models/mlcd/modeling_mlcd.py +7 -5
  576. transformers/models/mlcd/modular_mlcd.py +7 -5
  577. transformers/models/mllama/configuration_mllama.py +5 -7
  578. transformers/models/mllama/image_processing_mllama_fast.py +6 -5
  579. transformers/models/mllama/modeling_mllama.py +19 -19
  580. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
  581. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
  582. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
  583. transformers/models/mobilebert/configuration_mobilebert.py +4 -1
  584. transformers/models/mobilebert/modeling_mobilebert.py +3 -3
  585. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
  586. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
  587. transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
  588. transformers/models/mobilevit/modeling_mobilevit.py +4 -2
  589. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
  590. transformers/models/modernbert/configuration_modernbert.py +46 -21
  591. transformers/models/modernbert/modeling_modernbert.py +146 -899
  592. transformers/models/modernbert/modular_modernbert.py +185 -908
  593. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
  594. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
  595. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
  596. transformers/models/moonshine/configuration_moonshine.py +12 -7
  597. transformers/models/moonshine/modeling_moonshine.py +7 -7
  598. transformers/models/moonshine/modular_moonshine.py +19 -13
  599. transformers/models/moshi/configuration_moshi.py +28 -2
  600. transformers/models/moshi/modeling_moshi.py +4 -9
  601. transformers/models/mpnet/configuration_mpnet.py +6 -1
  602. transformers/models/mpt/configuration_mpt.py +16 -0
  603. transformers/models/mra/configuration_mra.py +8 -1
  604. transformers/models/mt5/configuration_mt5.py +9 -5
  605. transformers/models/mt5/modeling_mt5.py +5 -8
  606. transformers/models/musicgen/configuration_musicgen.py +12 -7
  607. transformers/models/musicgen/modeling_musicgen.py +6 -5
  608. transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
  609. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
  610. transformers/models/mvp/configuration_mvp.py +8 -4
  611. transformers/models/mvp/modeling_mvp.py +6 -4
  612. transformers/models/nanochat/configuration_nanochat.py +5 -7
  613. transformers/models/nanochat/modeling_nanochat.py +4 -4
  614. transformers/models/nanochat/modular_nanochat.py +4 -4
  615. transformers/models/nemotron/configuration_nemotron.py +5 -7
  616. transformers/models/nemotron/modeling_nemotron.py +4 -14
  617. transformers/models/nllb/tokenization_nllb.py +7 -5
  618. transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
  619. transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
  620. transformers/models/nougat/image_processing_nougat_fast.py +8 -8
  621. transformers/models/nystromformer/configuration_nystromformer.py +8 -1
  622. transformers/models/olmo/configuration_olmo.py +5 -7
  623. transformers/models/olmo/modeling_olmo.py +4 -4
  624. transformers/models/olmo/modular_olmo.py +3 -3
  625. transformers/models/olmo2/configuration_olmo2.py +9 -11
  626. transformers/models/olmo2/modeling_olmo2.py +4 -4
  627. transformers/models/olmo2/modular_olmo2.py +7 -7
  628. transformers/models/olmo3/configuration_olmo3.py +10 -11
  629. transformers/models/olmo3/modeling_olmo3.py +4 -4
  630. transformers/models/olmo3/modular_olmo3.py +13 -14
  631. transformers/models/olmoe/configuration_olmoe.py +5 -7
  632. transformers/models/olmoe/modeling_olmoe.py +4 -4
  633. transformers/models/olmoe/modular_olmoe.py +3 -3
  634. transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
  635. transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
  636. transformers/models/oneformer/configuration_oneformer.py +9 -46
  637. transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
  638. transformers/models/oneformer/modeling_oneformer.py +14 -9
  639. transformers/models/openai/configuration_openai.py +16 -0
  640. transformers/models/opt/configuration_opt.py +6 -6
  641. transformers/models/opt/modeling_opt.py +5 -5
  642. transformers/models/ovis2/configuration_ovis2.py +4 -0
  643. transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
  644. transformers/models/ovis2/modeling_ovis2.py +58 -99
  645. transformers/models/ovis2/modular_ovis2.py +52 -13
  646. transformers/models/owlv2/configuration_owlv2.py +4 -1
  647. transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
  648. transformers/models/owlv2/modeling_owlv2.py +40 -27
  649. transformers/models/owlv2/modular_owlv2.py +5 -5
  650. transformers/models/owlvit/configuration_owlvit.py +4 -1
  651. transformers/models/owlvit/modeling_owlvit.py +40 -27
  652. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
  653. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
  654. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
  655. transformers/models/paligemma/configuration_paligemma.py +4 -0
  656. transformers/models/paligemma/modeling_paligemma.py +30 -26
  657. transformers/models/parakeet/configuration_parakeet.py +2 -4
  658. transformers/models/parakeet/modeling_parakeet.py +3 -3
  659. transformers/models/parakeet/modular_parakeet.py +3 -3
  660. transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
  661. transformers/models/patchtst/modeling_patchtst.py +3 -3
  662. transformers/models/pe_audio/modeling_pe_audio.py +4 -4
  663. transformers/models/pe_audio/modular_pe_audio.py +1 -1
  664. transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
  665. transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
  666. transformers/models/pe_video/modeling_pe_video.py +36 -24
  667. transformers/models/pe_video/modular_pe_video.py +36 -23
  668. transformers/models/pegasus/configuration_pegasus.py +8 -5
  669. transformers/models/pegasus/modeling_pegasus.py +4 -4
  670. transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
  671. transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
  672. transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
  673. transformers/models/perceiver/modeling_perceiver.py +17 -9
  674. transformers/models/perception_lm/modeling_perception_lm.py +26 -27
  675. transformers/models/perception_lm/modular_perception_lm.py +27 -25
  676. transformers/models/persimmon/configuration_persimmon.py +5 -7
  677. transformers/models/persimmon/modeling_persimmon.py +5 -5
  678. transformers/models/phi/configuration_phi.py +8 -6
  679. transformers/models/phi/modeling_phi.py +4 -4
  680. transformers/models/phi/modular_phi.py +3 -3
  681. transformers/models/phi3/configuration_phi3.py +9 -11
  682. transformers/models/phi3/modeling_phi3.py +4 -4
  683. transformers/models/phi3/modular_phi3.py +3 -3
  684. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
  685. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
  686. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
  687. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
  688. transformers/models/phimoe/configuration_phimoe.py +5 -7
  689. transformers/models/phimoe/modeling_phimoe.py +15 -39
  690. transformers/models/phimoe/modular_phimoe.py +12 -7
  691. transformers/models/pix2struct/configuration_pix2struct.py +12 -9
  692. transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
  693. transformers/models/pix2struct/modeling_pix2struct.py +14 -7
  694. transformers/models/pixio/configuration_pixio.py +2 -4
  695. transformers/models/pixio/modeling_pixio.py +9 -8
  696. transformers/models/pixio/modular_pixio.py +4 -2
  697. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
  698. transformers/models/pixtral/modeling_pixtral.py +9 -12
  699. transformers/models/plbart/configuration_plbart.py +8 -5
  700. transformers/models/plbart/modeling_plbart.py +9 -7
  701. transformers/models/plbart/modular_plbart.py +1 -1
  702. transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
  703. transformers/models/pop2piano/configuration_pop2piano.py +7 -6
  704. transformers/models/pop2piano/modeling_pop2piano.py +2 -1
  705. transformers/models/pp_doclayout_v3/__init__.py +30 -0
  706. transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
  707. transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
  708. transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
  709. transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
  710. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
  711. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
  712. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
  713. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
  714. transformers/models/prophetnet/configuration_prophetnet.py +11 -10
  715. transformers/models/prophetnet/modeling_prophetnet.py +12 -23
  716. transformers/models/pvt/image_processing_pvt.py +7 -7
  717. transformers/models/pvt/image_processing_pvt_fast.py +1 -1
  718. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  719. transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
  720. transformers/models/qwen2/configuration_qwen2.py +14 -4
  721. transformers/models/qwen2/modeling_qwen2.py +4 -4
  722. transformers/models/qwen2/modular_qwen2.py +3 -3
  723. transformers/models/qwen2/tokenization_qwen2.py +0 -4
  724. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
  725. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
  726. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
  727. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
  728. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
  729. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
  730. transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
  731. transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
  732. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  733. transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
  734. transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
  735. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
  736. transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
  737. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
  738. transformers/models/qwen3/configuration_qwen3.py +15 -5
  739. transformers/models/qwen3/modeling_qwen3.py +4 -4
  740. transformers/models/qwen3/modular_qwen3.py +3 -3
  741. transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
  742. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  743. transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
  744. transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
  745. transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
  746. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
  747. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
  748. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
  749. transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
  750. transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
  751. transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
  752. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
  753. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
  754. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
  755. transformers/models/rag/configuration_rag.py +6 -6
  756. transformers/models/rag/modeling_rag.py +3 -3
  757. transformers/models/rag/retrieval_rag.py +1 -1
  758. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
  759. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
  760. transformers/models/reformer/configuration_reformer.py +7 -7
  761. transformers/models/rembert/configuration_rembert.py +8 -1
  762. transformers/models/rembert/modeling_rembert.py +0 -22
  763. transformers/models/resnet/configuration_resnet.py +2 -4
  764. transformers/models/resnet/modeling_resnet.py +6 -5
  765. transformers/models/roberta/configuration_roberta.py +11 -2
  766. transformers/models/roberta/modeling_roberta.py +6 -6
  767. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
  768. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
  769. transformers/models/roc_bert/configuration_roc_bert.py +8 -1
  770. transformers/models/roc_bert/modeling_roc_bert.py +6 -41
  771. transformers/models/roformer/configuration_roformer.py +13 -2
  772. transformers/models/roformer/modeling_roformer.py +0 -14
  773. transformers/models/rt_detr/configuration_rt_detr.py +8 -49
  774. transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
  775. transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
  776. transformers/models/rt_detr/modeling_rt_detr.py +578 -737
  777. transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
  778. transformers/models/rt_detr/modular_rt_detr.py +1508 -6
  779. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
  780. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
  781. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
  782. transformers/models/rwkv/configuration_rwkv.py +2 -3
  783. transformers/models/rwkv/modeling_rwkv.py +0 -23
  784. transformers/models/sam/configuration_sam.py +2 -0
  785. transformers/models/sam/image_processing_sam_fast.py +4 -4
  786. transformers/models/sam/modeling_sam.py +13 -8
  787. transformers/models/sam/processing_sam.py +3 -3
  788. transformers/models/sam2/configuration_sam2.py +1 -1
  789. transformers/models/sam2/modeling_sam2.py +56 -52
  790. transformers/models/sam2/modular_sam2.py +47 -55
  791. transformers/models/sam2_video/modeling_sam2_video.py +50 -51
  792. transformers/models/sam2_video/modular_sam2_video.py +12 -10
  793. transformers/models/sam3/modeling_sam3.py +43 -47
  794. transformers/models/sam3/processing_sam3.py +8 -4
  795. transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
  796. transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
  797. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
  798. transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
  799. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
  800. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
  801. transformers/models/sam3_video/modeling_sam3_video.py +27 -14
  802. transformers/models/sam_hq/configuration_sam_hq.py +2 -0
  803. transformers/models/sam_hq/modeling_sam_hq.py +13 -9
  804. transformers/models/sam_hq/modular_sam_hq.py +6 -6
  805. transformers/models/sam_hq/processing_sam_hq.py +7 -6
  806. transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
  807. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
  808. transformers/models/seed_oss/configuration_seed_oss.py +7 -9
  809. transformers/models/seed_oss/modeling_seed_oss.py +4 -4
  810. transformers/models/seed_oss/modular_seed_oss.py +3 -3
  811. transformers/models/segformer/image_processing_segformer_fast.py +4 -4
  812. transformers/models/segformer/modeling_segformer.py +4 -2
  813. transformers/models/segformer/modular_segformer.py +3 -3
  814. transformers/models/seggpt/modeling_seggpt.py +20 -8
  815. transformers/models/sew/configuration_sew.py +4 -1
  816. transformers/models/sew/modeling_sew.py +9 -5
  817. transformers/models/sew/modular_sew.py +2 -1
  818. transformers/models/sew_d/configuration_sew_d.py +4 -1
  819. transformers/models/sew_d/modeling_sew_d.py +4 -1
  820. transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
  821. transformers/models/siglip/configuration_siglip.py +4 -1
  822. transformers/models/siglip/modeling_siglip.py +27 -71
  823. transformers/models/siglip2/__init__.py +1 -0
  824. transformers/models/siglip2/configuration_siglip2.py +4 -2
  825. transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
  826. transformers/models/siglip2/modeling_siglip2.py +37 -78
  827. transformers/models/siglip2/modular_siglip2.py +74 -25
  828. transformers/models/siglip2/tokenization_siglip2.py +95 -0
  829. transformers/models/smollm3/configuration_smollm3.py +6 -6
  830. transformers/models/smollm3/modeling_smollm3.py +4 -4
  831. transformers/models/smollm3/modular_smollm3.py +9 -9
  832. transformers/models/smolvlm/configuration_smolvlm.py +1 -3
  833. transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
  834. transformers/models/smolvlm/modeling_smolvlm.py +75 -46
  835. transformers/models/smolvlm/modular_smolvlm.py +36 -23
  836. transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
  837. transformers/models/solar_open/__init__.py +27 -0
  838. transformers/models/solar_open/configuration_solar_open.py +184 -0
  839. transformers/models/solar_open/modeling_solar_open.py +642 -0
  840. transformers/models/solar_open/modular_solar_open.py +224 -0
  841. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
  842. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
  843. transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
  844. transformers/models/speecht5/configuration_speecht5.py +7 -8
  845. transformers/models/splinter/configuration_splinter.py +6 -6
  846. transformers/models/splinter/modeling_splinter.py +8 -3
  847. transformers/models/squeezebert/configuration_squeezebert.py +14 -1
  848. transformers/models/stablelm/configuration_stablelm.py +8 -6
  849. transformers/models/stablelm/modeling_stablelm.py +5 -5
  850. transformers/models/starcoder2/configuration_starcoder2.py +11 -5
  851. transformers/models/starcoder2/modeling_starcoder2.py +5 -5
  852. transformers/models/starcoder2/modular_starcoder2.py +4 -4
  853. transformers/models/superglue/configuration_superglue.py +4 -0
  854. transformers/models/superglue/image_processing_superglue_fast.py +4 -3
  855. transformers/models/superglue/modeling_superglue.py +9 -4
  856. transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
  857. transformers/models/superpoint/modeling_superpoint.py +4 -2
  858. transformers/models/swin/configuration_swin.py +2 -4
  859. transformers/models/swin/modeling_swin.py +11 -8
  860. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
  861. transformers/models/swin2sr/modeling_swin2sr.py +4 -2
  862. transformers/models/swinv2/configuration_swinv2.py +2 -4
  863. transformers/models/swinv2/modeling_swinv2.py +10 -7
  864. transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
  865. transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
  866. transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
  867. transformers/models/t5/configuration_t5.py +9 -8
  868. transformers/models/t5/modeling_t5.py +5 -8
  869. transformers/models/t5gemma/configuration_t5gemma.py +10 -25
  870. transformers/models/t5gemma/modeling_t5gemma.py +9 -9
  871. transformers/models/t5gemma/modular_t5gemma.py +11 -24
  872. transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
  873. transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
  874. transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
  875. transformers/models/table_transformer/configuration_table_transformer.py +18 -49
  876. transformers/models/table_transformer/modeling_table_transformer.py +27 -53
  877. transformers/models/tapas/configuration_tapas.py +12 -1
  878. transformers/models/tapas/modeling_tapas.py +1 -1
  879. transformers/models/tapas/tokenization_tapas.py +1 -0
  880. transformers/models/textnet/configuration_textnet.py +4 -6
  881. transformers/models/textnet/image_processing_textnet_fast.py +3 -3
  882. transformers/models/textnet/modeling_textnet.py +15 -14
  883. transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
  884. transformers/models/timesfm/modeling_timesfm.py +5 -6
  885. transformers/models/timesfm/modular_timesfm.py +5 -6
  886. transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
  887. transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
  888. transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
  889. transformers/models/trocr/configuration_trocr.py +11 -7
  890. transformers/models/trocr/modeling_trocr.py +4 -2
  891. transformers/models/tvp/configuration_tvp.py +10 -35
  892. transformers/models/tvp/image_processing_tvp_fast.py +6 -5
  893. transformers/models/tvp/modeling_tvp.py +1 -1
  894. transformers/models/udop/configuration_udop.py +16 -7
  895. transformers/models/udop/modeling_udop.py +10 -6
  896. transformers/models/umt5/configuration_umt5.py +8 -6
  897. transformers/models/umt5/modeling_umt5.py +7 -3
  898. transformers/models/unispeech/configuration_unispeech.py +4 -1
  899. transformers/models/unispeech/modeling_unispeech.py +7 -4
  900. transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
  901. transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
  902. transformers/models/upernet/configuration_upernet.py +8 -35
  903. transformers/models/upernet/modeling_upernet.py +1 -1
  904. transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
  905. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  906. transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
  907. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
  908. transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
  909. transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
  910. transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
  911. transformers/models/video_llava/configuration_video_llava.py +4 -0
  912. transformers/models/video_llava/modeling_video_llava.py +87 -89
  913. transformers/models/videomae/modeling_videomae.py +4 -5
  914. transformers/models/vilt/configuration_vilt.py +4 -1
  915. transformers/models/vilt/image_processing_vilt_fast.py +6 -6
  916. transformers/models/vilt/modeling_vilt.py +27 -12
  917. transformers/models/vipllava/configuration_vipllava.py +4 -0
  918. transformers/models/vipllava/modeling_vipllava.py +57 -31
  919. transformers/models/vipllava/modular_vipllava.py +50 -24
  920. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
  921. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
  922. transformers/models/visual_bert/configuration_visual_bert.py +6 -1
  923. transformers/models/vit/configuration_vit.py +2 -2
  924. transformers/models/vit/modeling_vit.py +7 -5
  925. transformers/models/vit_mae/modeling_vit_mae.py +11 -7
  926. transformers/models/vit_msn/modeling_vit_msn.py +11 -7
  927. transformers/models/vitdet/configuration_vitdet.py +2 -4
  928. transformers/models/vitdet/modeling_vitdet.py +2 -3
  929. transformers/models/vitmatte/configuration_vitmatte.py +6 -35
  930. transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
  931. transformers/models/vitmatte/modeling_vitmatte.py +1 -1
  932. transformers/models/vitpose/configuration_vitpose.py +6 -43
  933. transformers/models/vitpose/modeling_vitpose.py +5 -3
  934. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
  935. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
  936. transformers/models/vits/configuration_vits.py +4 -0
  937. transformers/models/vits/modeling_vits.py +9 -7
  938. transformers/models/vivit/modeling_vivit.py +4 -4
  939. transformers/models/vjepa2/modeling_vjepa2.py +9 -9
  940. transformers/models/voxtral/configuration_voxtral.py +0 -1
  941. transformers/models/voxtral/modeling_voxtral.py +25 -24
  942. transformers/models/voxtral/modular_voxtral.py +26 -20
  943. transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
  944. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
  945. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
  946. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
  947. transformers/models/wavlm/configuration_wavlm.py +4 -1
  948. transformers/models/wavlm/modeling_wavlm.py +4 -1
  949. transformers/models/whisper/configuration_whisper.py +6 -4
  950. transformers/models/whisper/generation_whisper.py +0 -1
  951. transformers/models/whisper/modeling_whisper.py +3 -3
  952. transformers/models/x_clip/configuration_x_clip.py +4 -1
  953. transformers/models/x_clip/modeling_x_clip.py +26 -27
  954. transformers/models/xglm/configuration_xglm.py +9 -7
  955. transformers/models/xlm/configuration_xlm.py +10 -7
  956. transformers/models/xlm/modeling_xlm.py +1 -1
  957. transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
  958. transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
  959. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
  960. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
  961. transformers/models/xlnet/configuration_xlnet.py +3 -1
  962. transformers/models/xlstm/configuration_xlstm.py +5 -7
  963. transformers/models/xlstm/modeling_xlstm.py +0 -32
  964. transformers/models/xmod/configuration_xmod.py +11 -2
  965. transformers/models/xmod/modeling_xmod.py +13 -16
  966. transformers/models/yolos/image_processing_yolos_fast.py +25 -28
  967. transformers/models/yolos/modeling_yolos.py +7 -7
  968. transformers/models/yolos/modular_yolos.py +16 -16
  969. transformers/models/yoso/configuration_yoso.py +8 -1
  970. transformers/models/youtu/__init__.py +27 -0
  971. transformers/models/youtu/configuration_youtu.py +194 -0
  972. transformers/models/youtu/modeling_youtu.py +619 -0
  973. transformers/models/youtu/modular_youtu.py +254 -0
  974. transformers/models/zamba/configuration_zamba.py +5 -7
  975. transformers/models/zamba/modeling_zamba.py +25 -56
  976. transformers/models/zamba2/configuration_zamba2.py +8 -13
  977. transformers/models/zamba2/modeling_zamba2.py +53 -78
  978. transformers/models/zamba2/modular_zamba2.py +36 -29
  979. transformers/models/zoedepth/configuration_zoedepth.py +17 -40
  980. transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
  981. transformers/models/zoedepth/modeling_zoedepth.py +5 -3
  982. transformers/pipelines/__init__.py +1 -61
  983. transformers/pipelines/any_to_any.py +1 -1
  984. transformers/pipelines/automatic_speech_recognition.py +0 -2
  985. transformers/pipelines/base.py +1 -1
  986. transformers/pipelines/image_text_to_text.py +1 -1
  987. transformers/pipelines/text_to_audio.py +5 -1
  988. transformers/processing_utils.py +35 -44
  989. transformers/pytorch_utils.py +2 -26
  990. transformers/quantizers/quantizer_compressed_tensors.py +7 -5
  991. transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
  992. transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
  993. transformers/quantizers/quantizer_mxfp4.py +1 -1
  994. transformers/quantizers/quantizer_torchao.py +0 -16
  995. transformers/safetensors_conversion.py +11 -4
  996. transformers/testing_utils.py +3 -28
  997. transformers/tokenization_mistral_common.py +9 -0
  998. transformers/tokenization_python.py +6 -4
  999. transformers/tokenization_utils_base.py +119 -219
  1000. transformers/tokenization_utils_tokenizers.py +31 -2
  1001. transformers/trainer.py +25 -33
  1002. transformers/trainer_seq2seq.py +1 -1
  1003. transformers/training_args.py +411 -417
  1004. transformers/utils/__init__.py +1 -4
  1005. transformers/utils/auto_docstring.py +15 -18
  1006. transformers/utils/backbone_utils.py +13 -373
  1007. transformers/utils/doc.py +4 -36
  1008. transformers/utils/generic.py +69 -33
  1009. transformers/utils/import_utils.py +72 -75
  1010. transformers/utils/loading_report.py +133 -105
  1011. transformers/utils/quantization_config.py +0 -21
  1012. transformers/video_processing_utils.py +5 -5
  1013. transformers/video_utils.py +3 -1
  1014. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
  1015. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
  1016. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
  1017. transformers/pipelines/deprecated/text2text_generation.py +0 -408
  1018. transformers/pipelines/image_to_text.py +0 -189
  1019. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
  1020. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
  1021. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1549 @@
1
+ # Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from dataclasses import dataclass
17
+ from typing import Optional
18
+
19
+ import numpy as np
20
+ import torch
21
+ import torch.nn.functional as F
22
+ import torchvision.transforms.v2.functional as tvF
23
+ from torch import nn
24
+
25
+ from ... import initialization as init
26
+ from ...backbone_utils import consolidate_backbone_kwargs_to_config
27
+ from ...configuration_utils import PreTrainedConfig
28
+ from ...image_processing_utils_fast import (
29
+ BaseImageProcessorFast,
30
+ BatchFeature,
31
+ )
32
+ from ...image_transforms import (
33
+ group_images_by_shape,
34
+ reorder_images,
35
+ )
36
+ from ...image_utils import PILImageResampling, SizeDict
37
+ from ...modeling_outputs import BaseModelOutput
38
+ from ...processing_utils import Unpack
39
+ from ...utils import (
40
+ ModelOutput,
41
+ TransformersKwargs,
42
+ auto_docstring,
43
+ is_cv2_available,
44
+ logging,
45
+ requires_backends,
46
+ )
47
+ from ...utils.generic import TensorType, can_return_tuple
48
+ from ..auto import AutoConfig
49
+ from ..resnet.modeling_resnet import ResNetConvLayer
50
+ from ..rt_detr.modeling_rt_detr import (
51
+ RTDetrDecoder,
52
+ RTDetrDecoderOutput,
53
+ RTDetrForObjectDetection,
54
+ RTDetrHybridEncoder,
55
+ RTDetrMLPPredictionHead,
56
+ RTDetrModel,
57
+ RTDetrModelOutput,
58
+ RTDetrMultiscaleDeformableAttention,
59
+ RTDetrPreTrainedModel,
60
+ get_contrastive_denoising_training_group,
61
+ inverse_sigmoid,
62
+ )
63
+
64
+
65
+ if is_cv2_available():
66
+ import cv2
67
+
68
+
69
+ logger = logging.get_logger(__name__)
70
+
71
+
72
+ class PPDocLayoutV3Config(PreTrainedConfig):
73
+ r"""
74
+ This is the configuration class to store the configuration of a [`PPDocLayoutV3Model`]. It is used to instantiate a
75
+ PP-DocLayoutV3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
76
+ with the defaults will yield a similar configuration to that of the PP-DocLayoutV3
77
+ [PaddlePaddle/PP-DocLayoutV3_safetensors](https://huggingface.co/PaddlePaddle/PP-DocLayoutV3_safetensors) architecture.
78
+
79
+ Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
80
+ documentation from [`PreTrainedConfig`] for more information.
81
+
82
+ Args:
83
+ initializer_range (`float`, *optional*, defaults to 0.01):
84
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
85
+ initializer_bias_prior_prob (`float`, *optional*):
86
+ The prior probability used by the bias initializer to initialize biases for `enc_score_head` and `class_embed`.
87
+ If `None`, `prior_prob` is computed as `prior_prob = 1 / (num_labels + 1)` when initializing the model weights.
88
+ layer_norm_eps (`float`, *optional*, defaults to 1e-05):
89
+ The epsilon used by the layer normalization layers.
90
+ batch_norm_eps (`float`, *optional*, defaults to 1e-05):
91
+ The epsilon used by the batch normalization layers.
92
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`):
93
+ Whether the model's input and output word embeddings should be tied.
94
+ backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*):
95
+ The configuration of the backbone model.
96
+ freeze_backbone_batch_norms (`bool`, *optional*, defaults to `True`):
97
+ Whether to freeze the batch normalization layers in the backbone.
98
+ encoder_hidden_dim (`int`, *optional*, defaults to 256):
99
+ Dimension of the layers in hybrid encoder.
100
+ encoder_in_channels (`list`, *optional*, defaults to `[512, 1024, 2048]`):
101
+ Channel dimensions of the multi-level input features for the encoder.
102
+ feat_strides (`list[int]`, *optional*, defaults to `[8, 16, 32]`):
103
+ Strides used in each feature map.
104
+ encoder_layers (`int`, *optional*, defaults to 1):
105
+ Total of layers to be used by the encoder.
106
+ encoder_ffn_dim (`int`, *optional*, defaults to 1024):
107
+ Dimension of the "intermediate" (often named feed-forward) layer in decoder.
108
+ encoder_attention_heads (`int`, *optional*, defaults to 8):
109
+ Number of attention heads for each attention layer in the Transformer encoder.
110
+ dropout (`float`, *optional*, defaults to 0.0):
111
+ The ratio for all dropout layers.
112
+ activation_dropout (`float`, *optional*, defaults to 0.0):
113
+ The dropout ratio for activations inside the fully connected layer.
114
+ encode_proj_layers (`list[int]`, *optional*, defaults to `[2]`):
115
+ Indexes of the projected layers to be used in the encoder.
116
+ positional_encoding_temperature (`int`, *optional*, defaults to 10000):
117
+ The temperature parameter used to create the positional encodings.
118
+ encoder_activation_function (`str`, *optional*, defaults to `"gelu"`):
119
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
120
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
121
+ activation_function (`str`, *optional*, defaults to `"silu"`):
122
+ The non-linear activation function (function or string) in the general layer. If string, `"gelu"`,
123
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
124
+ eval_size (`tuple[int, int]`, *optional*):
125
+ Height and width used to compute the effective height and width of the position embeddings after taking
126
+ into account the stride.
127
+ normalize_before (`bool`, *optional*, defaults to `False`):
128
+ Determine whether to apply layer normalization in the transformer encoder layer before self-attention and
129
+ feed-forward modules.
130
+ hidden_expansion (`float`, *optional*, defaults to 1.0):
131
+ Expansion ratio to enlarge the dimension size of RepVGGBlock and CSPRepLayer.
132
+ mask_feature_channels (`list[int]`, *optional*, defaults to `[64, 64]`):
133
+ The channels of the multi-level features for mask enhancement.
134
+ x4_feat_dim (`int`, *optional*, defaults to 128):
135
+ The dimension of the x4 feature map.
136
+ d_model (`int`, *optional*, defaults to 256):
137
+ Dimension of the layers, excluding the hybrid encoder.
138
+ num_prototypes (`int`, *optional*, defaults to 32):
139
+ Number of mask prototypes used for mask prediction (the output dimension of the mask query head).
140
+ label_noise_ratio (`float`, *optional*, defaults to 0.4):
141
+ The fraction of denoising labels to which random noise should be added.
142
+ box_noise_scale (`float`, *optional*, defaults to 0.4):
143
+ Scale or magnitude of noise to be added to the bounding boxes.
144
+ mask_enhanced (`bool`, *optional*, defaults to `True`):
145
+ Whether to use enhanced masked attention.
146
+ num_queries (`int`, *optional*, defaults to 300):
147
+ Number of object queries.
148
+ decoder_in_channels (`list`, *optional*, defaults to `[256, 256, 256]`):
149
+ Channel dimensions of the multi-level features for the decoder.
150
+ decoder_ffn_dim (`int`, *optional*, defaults to 1024):
151
+ Dimension of the "intermediate" (often named feed-forward) layer in decoder.
152
+ num_feature_levels (`int`, *optional*, defaults to 3):
153
+ The number of input feature levels.
154
+ decoder_n_points (`int`, *optional*, defaults to 4):
155
+ The number of sampled keys in each feature level for each attention head in the decoder.
156
+ decoder_layers (`int`, *optional*, defaults to 6):
157
+ Number of decoder layers.
158
+ decoder_attention_heads (`int`, *optional*, defaults to 8):
159
+ Number of attention heads for each attention layer in the Transformer decoder.
160
+ decoder_activation_function (`str`, *optional*, defaults to `"relu"`):
161
+ The non-linear activation function (function or string) in the decoder. If string, `"gelu"`,
162
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
163
+ attention_dropout (`float`, *optional*, defaults to 0.0):
164
+ The dropout ratio for the attention probabilities.
165
+ num_denoising (`int`, *optional*, defaults to 100):
166
+ The total number of denoising tasks or queries to be used for contrastive denoising.
167
+ learn_initial_query (`bool`, *optional*, defaults to `False`):
168
+ Indicates whether the initial query embeddings for the decoder should be learned during training.
169
+ anchor_image_size (`tuple[int, int]`, *optional*):
170
+ Height and width of the input image used during evaluation to generate the bounding box anchors. If `None`, anchors are generated automatically.
171
+ disable_custom_kernels (`bool`, *optional*, defaults to `True`):
172
+ Whether to disable custom kernels.
173
+ is_encoder_decoder (`bool`, *optional*, defaults to `True`):
174
+ Whether the architecture has an encoder decoder structure.
175
+ global_pointer_head_size (`int`, *optional*, defaults to 64):
176
+ The size of the global pointer head.
177
+ gp_dropout_value (`float`, *optional*, defaults to 0.1):
178
+ The dropout probability in the global pointer head.
179
+ Examples:
180
+
181
+ ```python
182
+ >>> from transformers import PPDocLayoutV3Config, PPDocLayoutV3ForObjectDetection
183
+
184
+ >>> # Initializing a PP-DocLayoutV3 configuration
185
+ >>> configuration = PPDocLayoutV3Config()
186
+
187
+ >>> # Initializing a model (with random weights) from the configuration
188
+ >>> model = PPDocLayoutV3ForObjectDetection(configuration)
189
+
190
+ >>> # Accessing the model configuration
191
+ >>> configuration = model.config
192
+ ```"""
193
+
194
+ model_type = "pp_doclayout_v3"
195
+ sub_configs = {"backbone_config": AutoConfig}
196
+
197
+ layer_types = ("basic", "bottleneck")
198
+ attribute_map = {
199
+ "hidden_size": "d_model",
200
+ "num_attention_heads": "encoder_attention_heads",
201
+ }
202
+
203
+ def __init__(
204
+ self,
205
+ initializer_range=0.01,
206
+ initializer_bias_prior_prob=None,
207
+ layer_norm_eps=1e-5,
208
+ batch_norm_eps=1e-5,
209
+ tie_word_embeddings=True,
210
+ # backbone
211
+ backbone_config=None,
212
+ freeze_backbone_batch_norms=True,
213
+ # encoder PPDocLayoutV3HybridEncoder
214
+ encoder_hidden_dim=256,
215
+ encoder_in_channels=[512, 1024, 2048],
216
+ feat_strides=[8, 16, 32],
217
+ encoder_layers=1,
218
+ encoder_ffn_dim=1024,
219
+ encoder_attention_heads=8,
220
+ dropout=0.0,
221
+ activation_dropout=0.0,
222
+ encode_proj_layers=[2],
223
+ positional_encoding_temperature=10000,
224
+ encoder_activation_function="gelu",
225
+ activation_function="silu",
226
+ eval_size=None,
227
+ normalize_before=False,
228
+ hidden_expansion=1.0,
229
+ mask_feature_channels=[64, 64],
230
+ x4_feat_dim=128,
231
+ # decoder PPDocLayoutV3Transformer
232
+ d_model=256,
233
+ num_prototypes=32,
234
+ label_noise_ratio=0.4,
235
+ box_noise_scale=0.4,
236
+ mask_enhanced=True,
237
+ num_queries=300,
238
+ decoder_in_channels=[256, 256, 256],
239
+ decoder_ffn_dim=1024,
240
+ num_feature_levels=3,
241
+ decoder_n_points=4,
242
+ decoder_layers=6,
243
+ decoder_attention_heads=8,
244
+ decoder_activation_function="relu",
245
+ attention_dropout=0.0,
246
+ num_denoising=100,
247
+ learn_initial_query=False,
248
+ anchor_image_size=None,
249
+ disable_custom_kernels=True,
250
+ is_encoder_decoder=True,
251
+ global_pointer_head_size=64,
252
+ gp_dropout_value=0.1,
253
+ **kwargs,
254
+ ):
255
+ self.initializer_range = initializer_range
256
+ self.initializer_bias_prior_prob = initializer_bias_prior_prob
257
+ self.layer_norm_eps = layer_norm_eps
258
+ self.batch_norm_eps = batch_norm_eps
259
+ self.tie_word_embeddings = tie_word_embeddings
260
+
261
+ backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
262
+ backbone_config=backbone_config,
263
+ default_config_type="hgnet_v2",
264
+ default_config_kwargs={
265
+ "arch": "L",
266
+ "return_idx": [0, 1, 2, 3],
267
+ "freeze_stem_only": True,
268
+ "freeze_at": 0,
269
+ "freeze_norm": True,
270
+ "lr_mult_list": [0, 0.05, 0.05, 0.05, 0.05],
271
+ "out_features": ["stage1", "stage2", "stage3", "stage4"],
272
+ },
273
+ **kwargs,
274
+ )
275
+
276
+ self.backbone_config = backbone_config
277
+ self.freeze_backbone_batch_norms = freeze_backbone_batch_norms
278
+
279
+ # ---- encoder ----
280
+ self.encoder_hidden_dim = encoder_hidden_dim
281
+ self.encoder_in_channels = list(encoder_in_channels)
282
+ self.feat_strides = list(feat_strides)
283
+ self.encoder_layers = encoder_layers
284
+ self.encoder_ffn_dim = encoder_ffn_dim
285
+ self.encoder_attention_heads = encoder_attention_heads
286
+ self.dropout = dropout
287
+ self.activation_dropout = activation_dropout
288
+ self.encode_proj_layers = list(encode_proj_layers)
289
+ self.positional_encoding_temperature = positional_encoding_temperature
290
+ self.encoder_activation_function = encoder_activation_function
291
+ self.activation_function = activation_function
292
+ self.eval_size = list(eval_size) if eval_size is not None else None
293
+ self.normalize_before = normalize_before
294
+ self.hidden_expansion = hidden_expansion
295
+ self.mask_feature_channels = mask_feature_channels
296
+ self.x4_feat_dim = x4_feat_dim
297
+
298
+ # ---- decoder ----
299
+ self.d_model = d_model
300
+ self.num_queries = num_queries
301
+ self.num_prototypes = num_prototypes
302
+ self.decoder_in_channels = list(decoder_in_channels)
303
+ self.decoder_ffn_dim = decoder_ffn_dim
304
+ self.num_feature_levels = num_feature_levels
305
+ self.decoder_n_points = decoder_n_points
306
+ self.decoder_layers = decoder_layers
307
+ self.decoder_attention_heads = decoder_attention_heads
308
+ self.decoder_activation_function = decoder_activation_function
309
+ self.attention_dropout = attention_dropout
310
+ self.num_denoising = num_denoising
311
+ self.label_noise_ratio = label_noise_ratio
312
+ self.mask_enhanced = mask_enhanced
313
+ self.box_noise_scale = box_noise_scale
314
+ self.learn_initial_query = learn_initial_query
315
+ self.anchor_image_size = list(anchor_image_size) if anchor_image_size is not None else None
316
+ self.disable_custom_kernels = disable_custom_kernels
317
+ self.global_pointer_head_size = global_pointer_head_size
318
+ self.gp_dropout_value = gp_dropout_value
319
+
320
+ super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
321
+
322
+
323
+ @auto_docstring
324
+ class PPDocLayoutV3ImageProcessorFast(BaseImageProcessorFast):
325
+ resample = PILImageResampling.BICUBIC
326
+ image_mean = [0, 0, 0]
327
+ image_std = [1, 1, 1]
328
+ size = {"height": 800, "width": 800}
329
+ do_resize = True
330
+ do_rescale = True
331
+ do_normalize = True
332
+
333
+ def __init__(self, **kwargs) -> None:
334
+ super().__init__(**kwargs)
335
+
336
+ # We require `self.resize(..., antialias=False)` to approximate the output of `cv2.resize`
337
+ def _preprocess(
338
+ self,
339
+ images: list["torch.Tensor"],
340
+ do_resize: bool,
341
+ size: SizeDict,
342
+ interpolation: Optional["tvF.InterpolationMode"],
343
+ do_center_crop: bool,
344
+ crop_size: SizeDict,
345
+ do_rescale: bool,
346
+ rescale_factor: float,
347
+ do_normalize: bool,
348
+ image_mean: float | list[float] | None,
349
+ image_std: float | list[float] | None,
350
+ do_pad: bool | None,
351
+ pad_size: SizeDict | None,
352
+ disable_grouping: bool | None,
353
+ return_tensors: str | TensorType | None,
354
+ **kwargs,
355
+ ) -> BatchFeature:
356
+ # Group images by size for batched resizing
357
+ grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
358
+ resized_images_grouped = {}
359
+ for shape, stacked_images in grouped_images.items():
360
+ if do_resize:
361
+ stacked_images = self.resize(
362
+ image=stacked_images, size=size, interpolation=interpolation, antialias=False
363
+ )
364
+ resized_images_grouped[shape] = stacked_images
365
+ resized_images = reorder_images(resized_images_grouped, grouped_images_index)
366
+
367
+ # Group images by size for further processing
368
+ # Needed in case do_resize is False, or resize returns images with different sizes
369
+ grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
370
+ processed_images_grouped = {}
371
+ for shape, stacked_images in grouped_images.items():
372
+ if do_center_crop:
373
+ stacked_images = self.center_crop(stacked_images, crop_size)
374
+ # Fused rescale and normalize
375
+ stacked_images = self.rescale_and_normalize(
376
+ stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
377
+ )
378
+ processed_images_grouped[shape] = stacked_images
379
+ processed_images = reorder_images(processed_images_grouped, grouped_images_index)
380
+
381
+ if do_pad:
382
+ processed_images = self.pad(processed_images, pad_size=pad_size, disable_grouping=disable_grouping)
383
+
384
+ return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
385
+
386
+ def _get_order_seqs(self, order_logits):
387
+ """
388
+ Computes the order sequences for a batch of inputs based on logits.
389
+
390
+ This function takes in the order logits, calculates order scores using a sigmoid activation,
391
+ and determines the order sequences by ranking the votes derived from the scores.
392
+
393
+ Args:
394
+ order_logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_queries)`):
395
+ Stacked order logits.
396
+
397
+ Returns:
398
+ `torch.Tensor` of shape `(batch_size, num_queries)`:
399
+ The computed order sequence for each input in the batch; each entry gives the rank of the corresponding query in the predicted order.
400
+ """
401
+ order_scores = torch.sigmoid(order_logits)
402
+ batch_size, sequence_length, _ = order_scores.shape
403
+
404
+ order_votes = order_scores.triu(diagonal=1).sum(dim=1) + (1.0 - order_scores.transpose(1, 2)).tril(
405
+ diagonal=-1
406
+ ).sum(dim=1)
407
+
408
+ order_pointers = torch.argsort(order_votes, dim=1)
409
+ order_seq = torch.empty_like(order_pointers)
410
+ ranks = torch.arange(sequence_length, device=order_pointers.device, dtype=order_pointers.dtype).expand(
411
+ batch_size, -1
412
+ )
413
+ order_seq.scatter_(1, order_pointers, ranks)
414
+
415
+ return order_seq
416
+
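For illustration only, a minimal standalone sketch of the vote-based ranking performed by `_get_order_seqs` above; the helper name and the toy logits below are made up and not part of this release:

```python
import torch

def order_from_pairwise_logits(order_logits: torch.Tensor) -> torch.Tensor:
    # order_logits[b, i, j] scores "query i comes before query j".
    scores = torch.sigmoid(order_logits)
    batch_size, num_queries, _ = scores.shape
    # votes[b, j] ~ how many other queries are predicted to come before query j.
    votes = scores.triu(diagonal=1).sum(dim=1) + (1.0 - scores.transpose(1, 2)).tril(diagonal=-1).sum(dim=1)
    pointers = torch.argsort(votes, dim=1)  # query indices ordered from first to last
    ranks = torch.arange(num_queries, device=votes.device).expand(batch_size, -1)
    order = torch.empty_like(pointers)
    order.scatter_(1, pointers, ranks)  # order[b, q] = rank of query q
    return order

# Toy batch with three queries: query 0 is scored before 1 and 2, query 1 before 2.
logits = torch.tensor([[[0.0, 3.0, 3.0], [-3.0, 0.0, 3.0], [-3.0, -3.0, 0.0]]])
print(order_from_pairwise_logits(logits))  # tensor([[0, 1, 2]])
```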
417
+ def extract_custom_vertices(self, polygon, sharp_angle_thresh=45):
418
+ poly = np.array(polygon)
419
+ n = len(poly)
420
+ res = []
421
+ i = 0
422
+ while i < n:
423
+ previous_point = poly[(i - 1) % n]
424
+ current_point = poly[i]
425
+ next_point = poly[(i + 1) % n]
426
+ vector_1 = previous_point - current_point
427
+ vector_2 = next_point - current_point
428
+ cross_product_value = (vector_1[1] * vector_2[0]) - (vector_1[0] * vector_2[1])
429
+ if cross_product_value < 0:
430
+ angle_cos = np.clip(
431
+ (vector_1 @ vector_2) / (np.linalg.norm(vector_1) * np.linalg.norm(vector_2)), -1.0, 1.0
432
+ )
433
+ angle = np.degrees(np.arccos(angle_cos))
434
+ if abs(angle - sharp_angle_thresh) < 1:
435
+ # Calculate the new point based on the direction of two vectors.
436
+ dir_vec = vector_1 / np.linalg.norm(vector_1) + vector_2 / np.linalg.norm(vector_2)
437
+ dir_vec = dir_vec / np.linalg.norm(dir_vec)
438
+ step_size = (np.linalg.norm(vector_1) + np.linalg.norm(vector_2)) / 2
439
+ new_point = current_point + dir_vec * step_size
440
+ res.append(tuple(new_point))
441
+ else:
442
+ res.append(tuple(current_point))
443
+ i += 1
444
+ return res
445
+
446
+ def _mask2polygon(self, mask, epsilon_ratio=0.004):
447
+ """
448
+ Postprocess mask by removing small noise.
449
+ Args:
450
+ mask (ndarray): The input mask of shape [H, W].
451
+ epsilon_ratio (float): Fraction of the contour perimeter used as the approximation tolerance in `cv2.approxPolyDP`.
452
+ Returns:
453
+ list or None: The polygon points approximating the largest contour of the mask, or `None` if no contour is found.
454
+ """
455
+ contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
456
+
457
+ if not contours:
458
+ return None
459
+
460
+ contours = max(contours, key=cv2.contourArea)
461
+ epsilon = epsilon_ratio * cv2.arcLength(contours, True)
462
+ approx_contours = cv2.approxPolyDP(contours, epsilon, True)
463
+ polygon_points = approx_contours.squeeze()
464
+ polygon_points = np.atleast_2d(polygon_points)
465
+
466
+ polygon_points = self.extract_custom_vertices(polygon_points)
467
+
468
+ return polygon_points
469
+
470
+ def _extract_polygon_points_by_masks(self, boxes, masks, scale_ratio):
471
+ scale_width, scale_height = scale_ratio[0] / 4, scale_ratio[1] / 4
472
+ mask_height, mask_width = masks.shape[1:]
473
+ polygon_points = []
474
+
475
+ for i in range(len(boxes)):
476
+ x_min, y_min, x_max, y_max = boxes[i].astype(np.int32)
477
+ box_w, box_h = x_max - x_min, y_max - y_min
478
+
479
+ # default rect
480
+ rect = np.array(
481
+ [[x_min, y_min], [x_max, y_min], [x_max, y_max], [x_min, y_max]],
482
+ dtype=np.float32,
483
+ )
484
+
485
+ if box_w <= 0 or box_h <= 0:
486
+ polygon_points.append(rect)
487
+ continue
488
+
489
+ # crop mask
490
+ x_coordinates = [int(round((x_min * scale_width).item())), int(round((x_max * scale_width).item()))]
491
+ x_start, x_end = np.clip(x_coordinates, 0, mask_width)
492
+ y_coordinates = [int(round((y_min * scale_height).item())), int(round((y_max * scale_height).item()))]
493
+ y_start, y_end = np.clip(y_coordinates, 0, mask_height)
494
+ cropped_mask = masks[i, y_start:y_end, x_start:x_end]
495
+
496
+ # resize mask to match box size
497
+ resized_mask = cv2.resize(cropped_mask.astype(np.uint8), (box_w, box_h), interpolation=cv2.INTER_NEAREST)
498
+
499
+ polygon = self._mask2polygon(resized_mask)
500
+ if polygon is not None and len(polygon) < 4:
501
+ polygon_points.append(rect)
502
+ continue
503
+ if polygon is not None and len(polygon) > 0:
504
+ polygon = polygon + np.array([x_min, y_min])
505
+
506
+ polygon_points.append(polygon)
507
+
508
+ return polygon_points
509
+
510
+ def post_process_object_detection(
511
+ self,
512
+ outputs,
513
+ threshold: float = 0.5,
514
+ target_sizes: TensorType | list[tuple] | None = None,
515
+ ):
516
+ """
517
+ Converts the raw output of [`PPDocLayoutV3ForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
518
+ bottom_right_x, bottom_right_y) format. Only supports PyTorch.
519
+
520
+ Args:
521
+ outputs ([`DetrObjectDetectionOutput`]):
522
+ Raw outputs of the model.
523
+ Returns:
524
+ `list[dict]`: A list of dictionaries, each containing the scores, labels, boxes, polygon_points and order_seq for an image
525
+ in the batch as predicted by the model.
526
+ """
527
+ requires_backends(self, ["torch", "cv2"])
528
+ boxes = outputs.pred_boxes
529
+ logits = outputs.logits
530
+ order_logits = outputs.order_logits
531
+ masks = outputs.out_masks
532
+
533
+ order_seqs = self._get_order_seqs(order_logits)
534
+
535
+ box_centers, box_dims = torch.split(boxes, 2, dim=-1)
536
+ top_left_coords = box_centers - 0.5 * box_dims
537
+ bottom_right_coords = box_centers + 0.5 * box_dims
538
+ boxes = torch.cat([top_left_coords, bottom_right_coords], dim=-1)
539
+
540
+ if target_sizes is not None:
541
+ if len(logits) != len(target_sizes):
542
+ raise ValueError(
543
+ "Make sure that you pass in as many target sizes as the batch dimension of the logits"
544
+ )
545
+ if isinstance(target_sizes, list):
546
+ img_height, img_width = torch.as_tensor(target_sizes).unbind(1)
547
+ else:
548
+ img_height, img_width = target_sizes.unbind(1)
549
+ scale_factor = torch.stack([img_width, img_height, img_width, img_height], dim=1).to(boxes.device)
550
+ boxes = boxes * scale_factor[:, None, :]
551
+
552
+ num_top_queries = logits.shape[1]
553
+ num_classes = logits.shape[2]
554
+
555
+ scores = torch.nn.functional.sigmoid(logits)
556
+ scores, index = torch.topk(scores.flatten(1), num_top_queries, dim=-1)
557
+ labels = index % num_classes
558
+ index = index // num_classes
559
+ boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1]))
560
+ masks = masks.gather(
561
+ dim=1, index=index.unsqueeze(-1).unsqueeze(-1).repeat(1, 1, masks.shape[-2], masks.shape[-1])
562
+ )
563
+ masks = (masks.sigmoid() > threshold).int()
564
+ order_seqs = order_seqs.gather(dim=1, index=index)
565
+
566
+ results = []
567
+ for score, label, box, order_seq, target_size, mask in zip(
568
+ scores, labels, boxes, order_seqs, target_sizes, masks
569
+ ):
570
+ order_seq = order_seq[score >= threshold]
571
+ order_seq, indices = torch.sort(order_seq)
572
+ polygon_points = self._extract_polygon_points_by_masks(
573
+ box[score >= threshold][indices].detach().cpu().numpy(),
574
+ mask[score >= threshold][indices].detach().cpu().numpy(),
575
+ [self.size["width"] / target_size[1], self.size["height"] / target_size[0]],
576
+ )
577
+ results.append(
578
+ {
579
+ "scores": score[score >= threshold][indices],
580
+ "labels": label[score >= threshold][indices],
581
+ "boxes": box[score >= threshold][indices],
582
+ "polygon_points": polygon_points,
583
+ "order_seq": order_seq,
584
+ }
585
+ )
586
+
587
+ return results
588
+
589
+
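As a small illustration of the box-format conversion and rescaling step inside `post_process_object_detection`, a toy example (the values are made up, not taken from the model):

```python
import torch

# One predicted box in (center_x, center_y, width, height), relative [0, 1] coordinates.
pred_boxes = torch.tensor([[[0.5, 0.5, 0.2, 0.4]]])
target_sizes = torch.tensor([[800, 600]])  # (height, width) of the original image

centers, dims = torch.split(pred_boxes, 2, dim=-1)
corners = torch.cat([centers - 0.5 * dims, centers + 0.5 * dims], dim=-1)

img_height, img_width = target_sizes.unbind(1)
scale = torch.stack([img_width, img_height, img_width, img_height], dim=1)
print(corners * scale[:, None, :])  # ~ tensor([[[240., 240., 360., 560.]]])
```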
590
+ class PPDocLayoutV3GlobalPointer(nn.Module):
591
+ def __init__(self, config):
592
+ super().__init__()
593
+ self.head_size = config.global_pointer_head_size
594
+ self.dense = nn.Linear(config.d_model, self.head_size * 2)
595
+ self.dropout = nn.Dropout(config.gp_dropout_value)
596
+
597
+ def forward(self, inputs):
598
+ batch_size, sequence_length, _ = inputs.shape
599
+ query_key_projection = self.dense(inputs).reshape(batch_size, sequence_length, 2, self.head_size)
600
+ query_key_projection = self.dropout(query_key_projection)
601
+ queries, keys = torch.unbind(query_key_projection, dim=2)
602
+
603
+ logits = (queries @ keys.transpose(-2, -1)) / (self.head_size**0.5)
604
+ mask = torch.tril(torch.ones(sequence_length, sequence_length, device=logits.device)).bool()
605
+ logits = logits.masked_fill(mask.unsqueeze(0), -1e4)
606
+
607
+ return logits
608
+
609
+
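A minimal sketch of the pairwise scoring done by `PPDocLayoutV3GlobalPointer`, with made-up sizes; after masking, only pairs (i, j) with i < j keep a meaningful score:

```python
import torch
from torch import nn

d_model, head_size, seq_len = 8, 4, 3  # made-up sizes
dense = nn.Linear(d_model, head_size * 2)

hidden_states = torch.randn(1, seq_len, d_model)
qk = dense(hidden_states).reshape(1, seq_len, 2, head_size)
queries, keys = torch.unbind(qk, dim=2)

logits = (queries @ keys.transpose(-2, -1)) / head_size**0.5
mask = torch.tril(torch.ones(seq_len, seq_len)).bool()  # diagonal and below
logits = logits.masked_fill(mask.unsqueeze(0), -1e4)    # keep only "i before j" scores with i < j
print(logits.shape)  # torch.Size([1, 3, 3])
```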
610
+ class PPDocLayoutV3MultiscaleDeformableAttention(RTDetrMultiscaleDeformableAttention):
611
+ pass
612
+
613
+
614
+ @auto_docstring
615
+ class PPDocLayoutV3PreTrainedModel(RTDetrPreTrainedModel):
616
+ @torch.no_grad()
617
+ def _init_weights(self, module):
618
+ """Initialize the weights"""
619
+ if isinstance(module, PPDocLayoutV3MultiscaleDeformableAttention):
620
+ init.constant_(module.sampling_offsets.weight, 0.0)
621
+ default_dtype = torch.get_default_dtype()
622
+ thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * (
623
+ 2.0 * math.pi / module.n_heads
624
+ )
625
+ grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
626
+ grid_init = (
627
+ (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
628
+ .view(module.n_heads, 1, 1, 2)
629
+ .repeat(1, module.n_levels, module.n_points, 1)
630
+ )
631
+ for i in range(module.n_points):
632
+ grid_init[:, :, i, :] *= i + 1
633
+
634
+ init.copy_(module.sampling_offsets.bias, grid_init.view(-1))
635
+ init.constant_(module.attention_weights.weight, 0.0)
636
+ init.constant_(module.attention_weights.bias, 0.0)
637
+ init.xavier_uniform_(module.value_proj.weight)
638
+ init.constant_(module.value_proj.bias, 0.0)
639
+ init.xavier_uniform_(module.output_proj.weight)
640
+ init.constant_(module.output_proj.bias, 0.0)
641
+
642
+ elif isinstance(module, PPDocLayoutV3Model):
643
+ prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1)
644
+ bias = float(-math.log((1 - prior_prob) / prior_prob))
645
+ init.xavier_uniform_(module.enc_score_head.weight)
646
+ init.constant_(module.enc_score_head.bias, bias)
647
+ init.xavier_uniform_(module.decoder.class_embed.weight)
648
+ init.constant_(module.decoder.class_embed.bias, bias)
649
+
650
+ elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
651
+ init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
652
+ if module.bias is not None:
653
+ init.zeros_(module.bias)
654
+ if getattr(module, "running_mean", None) is not None:
655
+ init.zeros_(module.running_mean)
656
+ init.ones_(module.running_var)
657
+ init.zeros_(module.num_batches_tracked)
658
+
659
+ elif isinstance(module, nn.LayerNorm):
660
+ init.ones_(module.weight)
661
+ init.zeros_(module.bias)
662
+
663
+ if isinstance(module, nn.Embedding):
664
+ init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
665
+ if module.padding_idx is not None:
666
+ init.zeros_(module.weight.data[module.padding_idx])
667
+
668
+
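For reference, a toy version of the sampling-offset bias initialization used in `_init_weights` above: one unit direction per attention head, scaled by the sampling-point index (the head/level/point counts below are made up):

```python
import math
import torch

n_heads, n_levels, n_points = 4, 3, 2  # made-up counts
thetas = torch.arange(n_heads, dtype=torch.float32) * (2.0 * math.pi / n_heads)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
grid_init = (
    (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
    .view(n_heads, 1, 1, 2)
    .repeat(1, n_levels, n_points, 1)
)
for i in range(n_points):
    grid_init[:, :, i, :] *= i + 1  # points farther out for larger point index
print(grid_init.view(-1).shape)  # torch.Size([48]) -> flattened into sampling_offsets.bias
```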
669
+ def mask_to_box_coordinate(mask, dtype):
670
+ mask = mask.bool()
671
+
672
+ height, width = mask.shape[-2:]
673
+
674
+ y_coords, x_coords = torch.meshgrid(
675
+ torch.arange(height, device=mask.device), torch.arange(width, device=mask.device), indexing="ij"
676
+ )
677
+ x_coords = x_coords.to(dtype)
678
+ y_coords = y_coords.to(dtype)
679
+
680
+ x_coords_masked = x_coords * mask
681
+ x_max = x_coords_masked.flatten(start_dim=-2).max(dim=-1).values + 1
682
+ x_min = (
683
+ torch.where(mask, x_coords_masked, torch.tensor(torch.finfo(dtype).max))
684
+ .flatten(start_dim=-2)
685
+ .min(dim=-1)
686
+ .values
687
+ )
688
+
689
+ y_coords_masked = y_coords * mask
690
+ y_max = y_coords_masked.flatten(start_dim=-2).max(dim=-1).values + 1
691
+ y_min = (
692
+ torch.where(mask, y_coords_masked, torch.tensor(torch.finfo(dtype).max))
693
+ .flatten(start_dim=-2)
694
+ .min(dim=-1)
695
+ .values
696
+ )
697
+
698
+ unnormalized_bbox = torch.stack([x_min, y_min, x_max, y_max], dim=-1)
699
+
700
+ is_mask_non_empty = torch.any(mask, dim=(-2, -1)).unsqueeze(-1)
701
+ unnormalized_bbox = unnormalized_bbox * is_mask_non_empty
702
+
703
+ norm_tensor = torch.tensor([width, height, width, height], device=mask.device, dtype=dtype)
704
+ normalized_bbox_xyxy = unnormalized_bbox / norm_tensor
705
+
706
+ x_min_norm, y_min_norm, x_max_norm, y_max_norm = normalized_bbox_xyxy.unbind(dim=-1)
707
+
708
+ center_x = (x_min_norm + x_max_norm) / 2
709
+ center_y = (y_min_norm + y_max_norm) / 2
710
+ box_width = x_max_norm - x_min_norm
711
+ box_height = y_max_norm - y_min_norm
712
+
713
+ return torch.stack([center_x, center_y, box_width, box_height], dim=-1)
714
+
715
+
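A quick worked example for `mask_to_box_coordinate` (this assumes the function defined above is in scope; the toy mask is made up):

```python
import torch

# 4x6 toy mask with one rectangle covering rows 1..2 and columns 2..4.
mask = torch.zeros(1, 4, 6, dtype=torch.bool)
mask[0, 1:3, 2:5] = True

# The pixel box is x_min=2, y_min=1, x_max=5, y_max=3; normalized by (width=6, height=4)
# this gives (center_x, center_y, box_width, box_height) ~ (0.5833, 0.5, 0.5, 0.5).
print(mask_to_box_coordinate(mask, dtype=torch.float32))
```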
716
+ @dataclass
717
+ class PPDocLayoutV3DecoderOutput(RTDetrDecoderOutput):
718
+ r"""
719
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
720
+ Stacked intermediate hidden states (output of each layer of the decoder).
721
+ intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`):
722
+ Stacked intermediate logits (logits of each layer of the decoder).
723
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
724
+ Stacked intermediate reference points (reference points of each layer of the decoder).
725
+ intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
726
+ Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
727
+ initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
728
+ Stacked initial reference points (initial reference points of each layer of the decoder).
729
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
730
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
731
+ sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
732
+ used to compute the weighted average in the cross-attention heads.
733
+ decoder_out_order_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.num_queries, config.num_queries)`):
734
+ Stacked order logits (order logits of each layer of the decoder).
735
+ decoder_out_masks (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.num_queries, 200, 200)`):
736
+ Stacked masks (masks of each layer of the decoder).
737
+ """
738
+
739
+ decoder_out_order_logits: torch.FloatTensor | None = None
740
+ decoder_out_masks: torch.FloatTensor | None = None
741
+
742
+
743
+ @dataclass
744
+ @auto_docstring(
745
+ custom_intro="""
746
+ Base class for outputs of the PP-DocLayoutV3 model.
747
+ """
748
+ )
749
+ class PPDocLayoutV3ModelOutput(RTDetrModelOutput):
750
+ r"""
751
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
752
+ Sequence of hidden-states at the output of the last layer of the decoder of the model.
753
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
754
+ Stacked intermediate hidden states (output of each layer of the decoder).
755
+ intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`):
756
+ Stacked intermediate logits (logits of each layer of the decoder).
757
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
758
+ Stacked intermediate reference points (reference points of each layer of the decoder).
759
+ intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
760
+ Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
761
+ initial_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
762
+ Initial reference points used for the first decoder layer.
763
+ init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
764
+ Initial reference points sent through the Transformer decoder.
765
+ enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
766
+ Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
767
+ picked as region proposals in the encoder stage. Output of bounding box binary classification (i.e.
768
+ foreground and background).
769
+ enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`):
770
+ Logits of predicted bounding boxes coordinates in the encoder stage.
771
+ enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
772
+ Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
773
+ picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
774
+ foreground and background).
775
+ enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
776
+ Logits of predicted bounding boxes coordinates in the first stage.
777
+ denoising_meta_values (`dict`):
778
+ Extra dictionary for the denoising related values.
779
+ out_order_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.num_queries, config.num_queries)`):
780
+ Stacked order logits (order logits of each layer of the decoder).
781
+ out_masks (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.num_queries, 200, 200)`):
782
+ Stacked masks (masks of each layer of the decoder).
783
+ """
784
+
785
+ out_order_logits: torch.FloatTensor | None = None
786
+ out_masks: torch.FloatTensor | None = None
787
+
788
+
789
+ class PPDocLayoutV3MLPPredictionHead(RTDetrMLPPredictionHead):
790
+ pass
791
+
792
+
793
+ class PPDocLayoutV3ConvLayer(ResNetConvLayer):
794
+ pass
795
+
796
+
797
+ class PPDocLayoutV3ScaleHead(nn.Module):
798
+ def __init__(self, in_channels, feature_channels, fpn_stride, base_stride, align_corners=False):
799
+ super().__init__()
800
+ head_length = max(1, int(np.log2(fpn_stride) - np.log2(base_stride)))
801
+ self.layers = nn.ModuleList()
802
+ for k in range(head_length):
803
+ in_c = in_channels if k == 0 else feature_channels
804
+ self.layers.append(PPDocLayoutV3ConvLayer(in_c, feature_channels, 3, 1, "silu"))
805
+ if fpn_stride != base_stride:
806
+ self.layers.append(nn.Upsample(scale_factor=2, mode="bilinear", align_corners=align_corners))
807
+
808
+ def forward(self, x):
809
+ for layer in self.layers:
810
+ x = layer(x)
811
+ return x
812
+
813
+
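A small worked example of the `head_length` formula in `PPDocLayoutV3ScaleHead`: each head gets one conv stage per factor-of-two gap between its FPN stride and the base stride, with a 2x upsample appended per stage when the strides differ (the stride values below are illustrative):

```python
import numpy as np

base_stride = 8
for fpn_stride in (8, 16, 32):
    head_length = max(1, int(np.log2(fpn_stride) - np.log2(base_stride)))
    upsamples = head_length if fpn_stride != base_stride else 0
    print(fpn_stride, head_length, upsamples)
# 8  -> 1 conv, 0 upsamples
# 16 -> 1 conv, 1 upsample  (16 -> 8)
# 32 -> 2 convs, 2 upsamples (32 -> 16 -> 8)
```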
814
+ class PPDocLayoutV3MaskFeatFPN(nn.Module):
815
+ def __init__(
816
+ self,
817
+ in_channels=[256, 256, 256],
818
+ fpn_strides=[32, 16, 8],
819
+ feature_channels=256,
820
+ dropout_ratio=0.0,
821
+ out_channels=256,
822
+ align_corners=False,
823
+ ):
824
+ super().__init__()
825
+
826
+ reorder_index = np.argsort(fpn_strides, axis=0).tolist()
827
+ in_channels = [in_channels[i] for i in reorder_index]
828
+ fpn_strides = [fpn_strides[i] for i in reorder_index]
829
+
830
+ self.reorder_index = reorder_index
831
+ self.fpn_strides = fpn_strides
832
+ self.dropout_ratio = dropout_ratio
833
+ self.align_corners = align_corners
834
+ if self.dropout_ratio > 0:
835
+ self.dropout = nn.Dropout2d(dropout_ratio)
836
+
837
+ self.scale_heads = nn.ModuleList()
838
+ for i in range(len(fpn_strides)):
839
+ self.scale_heads.append(
840
+ PPDocLayoutV3ScaleHead(
841
+ in_channels=in_channels[i],
842
+ feature_channels=feature_channels,
843
+ fpn_stride=fpn_strides[i],
844
+ base_stride=fpn_strides[0],
845
+ align_corners=align_corners,
846
+ )
847
+ )
848
+ self.output_conv = PPDocLayoutV3ConvLayer(feature_channels, out_channels, 3, 1, "silu")
849
+
850
+ def forward(self, inputs):
851
+ x = [inputs[i] for i in self.reorder_index]
852
+
853
+ output = self.scale_heads[0](x[0])
854
+ for i in range(1, len(self.fpn_strides)):
855
+ output = output + F.interpolate(
856
+ self.scale_heads[i](x[i]), size=output.shape[2:], mode="bilinear", align_corners=self.align_corners
857
+ )
858
+
859
+ if self.dropout_ratio > 0:
860
+ output = self.dropout(output)
861
+ output = self.output_conv(output)
862
+ return output
863
+
864
+
865
+ class PPDocLayoutV3EncoderMaskOutput(nn.Module):
866
+ def __init__(self, in_channels, num_prototypes):
867
+ super().__init__()
868
+ self.base_conv = PPDocLayoutV3ConvLayer(in_channels, in_channels, 3, 1, "silu")
869
+ self.conv = nn.Conv2d(in_channels, num_prototypes, kernel_size=1)
870
+
871
+ def forward(self, x):
872
+ x = self.base_conv(x)
873
+ x = self.conv(x)
874
+ return x
875
+
876
+
877
+ class PPDocLayoutV3HybridEncoder(RTDetrHybridEncoder):
878
+ """
879
+ Main differences from `RTDetrHybridEncoder`:
880
+ 1. Mask Feature Head: Added `PPDocLayoutV3MaskFeatFPN` module (`self.mask_feature_head`) for document-specific mask feature generation.
881
+ 2. Extra Conv Layers: Introduced `self.encoder_mask_lateral` and `self.encoder_mask_output` for mask feature processing and output.
882
+ """
883
+
884
+ def __init__(self, config: PPDocLayoutV3Config):
885
+ super().__init__()
886
+
887
+ feat_strides = config.feat_strides
888
+ mask_feature_channels = config.mask_feature_channels
889
+ self.mask_feature_head = PPDocLayoutV3MaskFeatFPN(
890
+ [self.encoder_hidden_dim] * len(feat_strides),
891
+ feat_strides,
892
+ feature_channels=mask_feature_channels[0],
893
+ out_channels=mask_feature_channels[1],
894
+ )
895
+ self.encoder_mask_lateral = PPDocLayoutV3ConvLayer(config.x4_feat_dim, mask_feature_channels[1], 3, 1, "silu")
896
+ self.encoder_mask_output = PPDocLayoutV3EncoderMaskOutput(
897
+ in_channels=mask_feature_channels[1], num_prototypes=config.num_prototypes
898
+ )
899
+
900
+ def forward(
901
+ self,
902
+ inputs_embeds=None,
903
+ x4_feat=None,
904
+ **kwargs: Unpack[TransformersKwargs],
905
+ ):
906
+ r"""
907
+ Args:
908
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
909
+ Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
910
+ """
911
+ feature_maps = inputs_embeds
912
+
913
+ # AIFI: Apply transformer encoder to specified feature levels
914
+ if self.config.encoder_layers > 0:
915
+ for i, enc_ind in enumerate(self.encode_proj_layers):
916
+ feature_maps[enc_ind] = self.aifi[i](feature_maps[enc_ind], **kwargs)
917
+
918
+ # top-down FPN
919
+ fpn_feature_maps = [feature_maps[-1]]
920
+ for idx, (lateral_conv, fpn_block) in enumerate(zip(self.lateral_convs, self.fpn_blocks)):
921
+ backbone_feature_map = feature_maps[self.num_fpn_stages - idx - 1]
922
+ top_fpn_feature_map = fpn_feature_maps[-1]
923
+ # apply lateral block
924
+ top_fpn_feature_map = lateral_conv(top_fpn_feature_map)
925
+ fpn_feature_maps[-1] = top_fpn_feature_map
926
+ # apply fpn block
927
+ top_fpn_feature_map = F.interpolate(top_fpn_feature_map, scale_factor=2.0, mode="nearest")
928
+ fused_feature_map = torch.concat([top_fpn_feature_map, backbone_feature_map], dim=1)
929
+ new_fpn_feature_map = fpn_block(fused_feature_map)
930
+ fpn_feature_maps.append(new_fpn_feature_map)
931
+
932
+ fpn_feature_maps.reverse()
933
+
934
+ # bottom-up PAN
935
+ pan_feature_maps = [fpn_feature_maps[0]]
936
+ for idx, (downsample_conv, pan_block) in enumerate(zip(self.downsample_convs, self.pan_blocks)):
937
+ top_pan_feature_map = pan_feature_maps[-1]
938
+ fpn_feature_map = fpn_feature_maps[idx + 1]
939
+ downsampled_feature_map = downsample_conv(top_pan_feature_map)
940
+ fused_feature_map = torch.concat([downsampled_feature_map, fpn_feature_map], dim=1)
941
+ new_pan_feature_map = pan_block(fused_feature_map)
942
+ pan_feature_maps.append(new_pan_feature_map)
943
+
944
+ mask_feat = self.mask_feature_head(pan_feature_maps)
945
+ mask_feat = F.interpolate(mask_feat, scale_factor=2, mode="bilinear", align_corners=False)
946
+ mask_feat += self.encoder_mask_lateral(x4_feat[0])
947
+ mask_feat = self.encoder_mask_output(mask_feat)
948
+
949
+ return PPDocLayoutV3HybridEncoderOutput(
950
+ last_hidden_state=pan_feature_maps,
951
+ mask_feat=mask_feat,
952
+ )
953
+
954
+
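To make the top-down FPN step in the encoder forward concrete, a toy fusion of one coarse and one fine level (channel and spatial sizes are made up):

```python
import torch
import torch.nn.functional as F

top = torch.randn(1, 256, 10, 10)       # coarser level, after the lateral conv
backbone = torch.randn(1, 256, 20, 20)  # finer backbone level
upsampled = F.interpolate(top, scale_factor=2.0, mode="nearest")
fused = torch.cat([upsampled, backbone], dim=1)
print(fused.shape)  # torch.Size([1, 512, 20, 20]) -> fed to the FPN block
```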
955
+ class PPDocLayoutV3Decoder(RTDetrDecoder):
956
+ """
957
+ Main difference from `RTDetrDecoder`:
958
+ A new mask generation process is introduced at each decoder layer.
959
+ """
960
+
961
+ def __init__(self, config: PPDocLayoutV3Config):
962
+ super().__init__()
963
+
964
+ self.num_queries = config.num_queries
965
+
966
+ def forward(
967
+ self,
968
+ inputs_embeds=None,
969
+ encoder_hidden_states=None,
970
+ encoder_attention_mask=None,
971
+ reference_points=None,
972
+ spatial_shapes=None,
973
+ spatial_shapes_list=None,
974
+ level_start_index=None,
975
+ order_head=None,
976
+ global_pointer=None,
977
+ mask_query_head=None,
978
+ norm=None,
979
+ mask_feat=None,
980
+ **kwargs: Unpack[TransformersKwargs],
981
+ ):
982
+ r"""
983
+ Args:
984
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
985
+ The query embeddings that are passed into the decoder.
986
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
987
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
988
+ of the decoder.
989
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
990
+ Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
991
+ in `[0, 1]`:
992
+ - 1 for pixels that are real (i.e. **not masked**),
993
+ - 0 for pixels that are padding (i.e. **masked**).
994
+ reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` if `as_two_stage` else `(batch_size, num_queries, 2)`, *optional*):
995
+ Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area.
+ spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
+ Spatial shapes of the feature maps.
+ level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
+ Indexes for the start of each feature level. In range `[0, sequence_length]`.
+ """
+ if inputs_embeds is not None:
+ hidden_states = inputs_embeds
+
+ # decoder layers
+ intermediate = ()
+ intermediate_reference_points = ()
+ intermediate_logits = ()
+ decoder_out_order_logits = ()
+ decoder_out_masks = ()
+
+ reference_points = F.sigmoid(reference_points)
+
+ # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py#L252
+ for idx, decoder_layer in enumerate(self.layers):
+ reference_points_input = reference_points.unsqueeze(2)
+ object_queries_position_embeddings = self.query_pos_head(reference_points)
+
+ hidden_states = decoder_layer(
+ hidden_states,
+ object_queries_position_embeddings=object_queries_position_embeddings,
+ encoder_hidden_states=encoder_hidden_states,
+ reference_points=reference_points_input,
+ spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
+ level_start_index=level_start_index,
+ encoder_attention_mask=encoder_attention_mask,
+ **kwargs,
+ )
+
+ # hack implementation for iterative bounding box refinement
+ if self.bbox_embed is not None:
+ predicted_corners = self.bbox_embed(hidden_states)
+ new_reference_points = F.sigmoid(predicted_corners + inverse_sigmoid(reference_points))
+ reference_points = new_reference_points.detach()
+
+ intermediate += (hidden_states,)
+ intermediate_reference_points += (
+ (new_reference_points,) if self.bbox_embed is not None else (reference_points,)
+ )
+
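+ # Per-layer mask prediction: normalize this layer's queries, project them into the prototype
+ # space with `mask_query_head`, and multiply with the shared encoder mask features to obtain
+ # one mask per query for this decoder layer.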
+ # get_pred_class_order_and_mask
+ out_query = norm(hidden_states)
+ mask_query_embed = mask_query_head(out_query)
+ batch_size, mask_dim, _ = mask_query_embed.shape
+ _, _, mask_h, mask_w = mask_feat.shape
+ out_mask = torch.bmm(mask_query_embed, mask_feat.flatten(start_dim=2)).reshape(
+ batch_size, mask_dim, mask_h, mask_w
+ )
+ decoder_out_masks += (out_mask,)
+
+ if self.class_embed is not None:
+ logits = self.class_embed(out_query)
+ intermediate_logits += (logits,)
+
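+ # Reading-order prediction: a per-layer linear head feeds the global pointer, which scores
+ # query pairs and produces order logits of shape (batch_size, num_queries, num_queries).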
+ if order_head is not None and global_pointer is not None:
+ valid_query = out_query[:, -self.num_queries :] if self.num_queries is not None else out_query
+ order_logits = global_pointer(order_head[idx](valid_query))
+ decoder_out_order_logits += (order_logits,)
+
+ # Keep batch_size as first dimension
+ intermediate = torch.stack(intermediate, dim=1)
+ intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1)
+ if self.class_embed is not None:
+ intermediate_logits = torch.stack(intermediate_logits, dim=1)
+ if order_head is not None and global_pointer is not None:
+ decoder_out_order_logits = torch.stack(decoder_out_order_logits, dim=1)
+ decoder_out_masks = torch.stack(decoder_out_masks, dim=1)
+
+ return PPDocLayoutV3DecoderOutput(
+ last_hidden_state=hidden_states,
+ intermediate_hidden_states=intermediate,
+ intermediate_logits=intermediate_logits,
+ intermediate_reference_points=intermediate_reference_points,
+ decoder_out_order_logits=decoder_out_order_logits,
+ decoder_out_masks=decoder_out_masks,
+ )
+
+
+ @auto_docstring(
+ custom_intro="""
+ PP-DocLayoutV3 Model (consisting of a backbone and encoder-decoder) outputting raw hidden states without any head on top.
+ """
+ )
+ class PPDocLayoutV3Model(RTDetrModel):
+ _tied_weights_keys = {
+ "decoder.class_embed": "enc_score_head",
+ "decoder.bbox_embed": "enc_bbox_head",
+ }
+
+ def __init__(self, config: PPDocLayoutV3Config):
+ super().__init__(config)
+
+ encoder_input_proj_list = []
+ self.encoder_input_proj = nn.ModuleList(encoder_input_proj_list[1:])
+
+ self.decoder_order_head = nn.ModuleList(
+ [nn.Linear(config.d_model, config.d_model) for _ in range(config.decoder_layers)]
+ )
+ self.decoder_global_pointer = PPDocLayoutV3GlobalPointer(config)
+ self.decoder_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
+ self.decoder = PPDocLayoutV3Decoder(config)
+ self.decoder.class_embed = nn.Linear(config.d_model, config.num_labels)
+ self.decoder.bbox_embed = PPDocLayoutV3MLPPredictionHead(config.d_model, config.d_model, 4, num_layers=3)
+
+ self.mask_enhanced = config.mask_enhanced
+ self.mask_query_head = PPDocLayoutV3MLPPredictionHead(
+ config.d_model, config.d_model, config.num_prototypes, num_layers=3
+ )
+
+ @auto_docstring
+ @can_return_tuple
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ pixel_mask: torch.LongTensor | None = None,
+ encoder_outputs: torch.FloatTensor | None = None,
+ labels: list[dict] | None = None,
+ **kwargs: Unpack[TransformersKwargs],
+ ) -> tuple[torch.FloatTensor] | PPDocLayoutV3ModelOutput:
+ r"""
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+ can choose to directly pass a flattened representation of an image.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+ Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+ embedded representation.
+ labels (`list[Dict]` of len `(batch_size,)`, *optional*):
+ Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+ following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+ respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+ in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, PPDocLayoutV3Model
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("PaddlePaddle/PP-DocLayoutV3_safetensors")
+ >>> model = PPDocLayoutV3Model.from_pretrained("PaddlePaddle/PP-DocLayoutV3_safetensors")
+
+ >>> inputs = image_processor(images=image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+
+ >>> last_hidden_states = outputs.last_hidden_state
+ >>> list(last_hidden_states.shape)
+ [1, 300, 256]
+ ```"""
+ batch_size, num_channels, height, width = pixel_values.shape
+ device = pixel_values.device
+
+ if pixel_mask is None:
+ pixel_mask = torch.ones(((batch_size, height, width)), device=device)
+
+ features = self.backbone(pixel_values, pixel_mask)
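+ # The first backbone feature map is reserved for the hybrid encoder's mask-feature branch
+ # (see `encoder_mask_lateral`); only the remaining maps are projected as encoder inputs.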
+ x4_feat = features.pop(0)
+ proj_feats = [self.encoder_input_proj[level](source) for level, (source, mask) in enumerate(features)]
+
+ if encoder_outputs is None:
+ encoder_outputs = self.encoder(
+ proj_feats,
+ x4_feat,
+ **kwargs,
+ )
+ # If the user passed a tuple for encoder_outputs, we wrap it in a PPDocLayoutV3HybridEncoderOutput when return_dict=True
+ elif not isinstance(encoder_outputs, PPDocLayoutV3HybridEncoderOutput):
+ encoder_outputs = PPDocLayoutV3HybridEncoderOutput(
+ last_hidden_state=encoder_outputs[0],
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+ mask_feat=encoder_outputs[-1],
+ )
+
+ # Equivalent to def _get_encoder_input
+ # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py#L412
+ sources = []
+ for level, source in enumerate(encoder_outputs.last_hidden_state):
+ sources.append(self.decoder_input_proj[level](source))
+
+ # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage
+ if self.config.num_feature_levels > len(sources):
+ _len_sources = len(sources)
+ sources.append(self.decoder_input_proj[_len_sources](encoder_outputs.last_hidden_state[-1]))
+ for i in range(_len_sources + 1, self.config.num_feature_levels):
+ sources.append(self.decoder_input_proj[i](encoder_outputs.last_hidden_state[-1]))
+
+ # Prepare encoder inputs (by flattening)
+ source_flatten = []
+ spatial_shapes_list = []
+ spatial_shapes = torch.empty((len(sources), 2), device=device, dtype=torch.long)
+ for level, source in enumerate(sources):
+ height, width = source.shape[-2:]
+ spatial_shapes[level, 0] = height
+ spatial_shapes[level, 1] = width
+ spatial_shapes_list.append((height, width))
+ source = source.flatten(2).transpose(1, 2)
+ source_flatten.append(source)
+ source_flatten = torch.cat(source_flatten, 1)
+ level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+
+ # prepare denoising training
+ if self.training and self.config.num_denoising > 0 and labels is not None:
+ (
+ denoising_class,
+ denoising_bbox_unact,
+ attention_mask,
+ denoising_meta_values,
+ ) = get_contrastive_denoising_training_group(
+ targets=labels,
+ num_classes=self.config.num_labels,
+ num_queries=self.config.num_queries,
+ class_embed=self.denoising_class_embed,
+ num_denoising_queries=self.config.num_denoising,
+ label_noise_ratio=self.config.label_noise_ratio,
+ box_noise_scale=self.config.box_noise_scale,
+ )
+ else:
+ denoising_class, denoising_bbox_unact, attention_mask, denoising_meta_values = None, None, None, None
+
+ batch_size = len(source_flatten)
+ device = source_flatten.device
+ dtype = source_flatten.dtype
+
+ # prepare input for decoder
+ if self.training or self.config.anchor_image_size is None:
+ # Pass spatial_shapes as tuple to make it hashable and make sure
+ # lru_cache is working for generate_anchors()
+ spatial_shapes_tuple = tuple(spatial_shapes_list)
+ anchors, valid_mask = self.generate_anchors(spatial_shapes_tuple, device=device, dtype=dtype)
+ else:
+ anchors, valid_mask = self.anchors, self.valid_mask
+ anchors, valid_mask = anchors.to(device, dtype), valid_mask.to(device, dtype)
+
+ # use the valid_mask to selectively retain values in the feature map where the mask is `True`
+ memory = valid_mask.to(source_flatten.dtype) * source_flatten
+
+ output_memory = self.enc_output(memory)
+
+ enc_outputs_class = self.enc_score_head(output_memory)
+ enc_outputs_coord_logits = self.enc_bbox_head(output_memory) + anchors
+
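+ # Two-stage query selection: keep the `num_queries` encoder tokens with the highest class
+ # score; their features and box logits initialize the decoder queries and reference points.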
+ _, topk_ind = torch.topk(enc_outputs_class.max(-1).values, self.config.num_queries, dim=1)
+
+ reference_points_unact = enc_outputs_coord_logits.gather(
+ dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_coord_logits.shape[-1])
+ )
+
+ # _get_pred_class_and_mask
+ batch_ind = torch.arange(memory.shape[0], device=output_memory.device).unsqueeze(1)
+ target = output_memory[batch_ind, topk_ind]
+ out_query = self.decoder_norm(target)
+ mask_query_embed = self.mask_query_head(out_query)
+ batch_size, mask_dim, _ = mask_query_embed.shape
+
+ enc_topk_bboxes = F.sigmoid(reference_points_unact)
+
+ enc_topk_logits = enc_outputs_class.gather(
+ dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_class.shape[-1])
+ )
+
+ # extract region features
+ if self.config.learn_initial_query:
+ target = self.weight_embedding.tile([batch_size, 1, 1])
+ else:
+ target = output_memory.gather(dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, output_memory.shape[-1]))
+ target = target.detach()
+
+ if denoising_class is not None:
+ target = torch.concat([denoising_class, target], 1)
+
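+ # Mask-enhanced box initialization: the selected queries first predict coarse masks, and the
+ # initial reference boxes are re-derived from those masks via `mask_to_box_coordinate`.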
+ if self.mask_enhanced:
+ _, _, mask_h, mask_w = encoder_outputs.mask_feat.shape
+ enc_out_masks = torch.bmm(mask_query_embed, encoder_outputs.mask_feat.flatten(start_dim=2)).reshape(
+ batch_size, mask_dim, mask_h, mask_w
+ )
+ reference_points = mask_to_box_coordinate(enc_out_masks > 0, dtype=reference_points_unact.dtype)
+ reference_points_unact = inverse_sigmoid(reference_points)
+
+ if denoising_bbox_unact is not None:
+ reference_points_unact = torch.concat([denoising_bbox_unact, reference_points_unact], 1)
+
+ init_reference_points = reference_points_unact.detach()
+
+ # decoder
+ decoder_outputs = self.decoder(
+ inputs_embeds=target,
+ encoder_hidden_states=source_flatten,
+ encoder_attention_mask=attention_mask,
+ reference_points=init_reference_points,
+ spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
+ level_start_index=level_start_index,
+ order_head=self.decoder_order_head,
+ global_pointer=self.decoder_global_pointer,
+ mask_query_head=self.mask_query_head,
+ norm=self.decoder_norm,
+ mask_feat=encoder_outputs.mask_feat,
+ **kwargs,
+ )
+
+ return PPDocLayoutV3ModelOutput(
+ last_hidden_state=decoder_outputs.last_hidden_state,
+ intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+ intermediate_logits=decoder_outputs.intermediate_logits,
+ intermediate_reference_points=decoder_outputs.intermediate_reference_points,
+ intermediate_predicted_corners=decoder_outputs.intermediate_predicted_corners,
+ initial_reference_points=decoder_outputs.initial_reference_points,
+ decoder_hidden_states=decoder_outputs.hidden_states,
+ decoder_attentions=decoder_outputs.attentions,
+ cross_attentions=decoder_outputs.cross_attentions,
+ out_order_logits=decoder_outputs.decoder_out_order_logits,
+ out_masks=decoder_outputs.decoder_out_masks,
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+ encoder_hidden_states=encoder_outputs.hidden_states,
+ encoder_attentions=encoder_outputs.attentions,
+ init_reference_points=init_reference_points,
+ enc_topk_logits=enc_topk_logits,
+ enc_topk_bboxes=enc_topk_bboxes,
+ enc_outputs_class=enc_outputs_class,
+ enc_outputs_coord_logits=enc_outputs_coord_logits,
+ denoising_meta_values=denoising_meta_values,
+ )
+
+
+ @dataclass
+ @auto_docstring
+ class PPDocLayoutV3HybridEncoderOutput(BaseModelOutput):
+ r"""
+ mask_feat (`torch.FloatTensor` of shape `(batch_size, config.num_prototypes, mask_height, mask_width)`):
+ Mask features produced by the hybrid encoder, shared across queries to compute the per-query masks.
+ """
1337
+
1338
+ mask_feat: torch.FloatTensor = None
1339
+
1340
+
1341
+ @dataclass
1342
+ @auto_docstring
1343
+ class PPDocLayoutV3ForObjectDetectionOutput(ModelOutput):
1344
+ r"""
1345
+ logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
1346
+ Classification logits (including no-object) for all queries.
1347
+ pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
1348
+ Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
1349
+ values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
1350
+ possible padding). You can use [`~PPDocLayoutV3ImageProcessorFast.post_process_object_detection`] to retrieve the
1351
+ unnormalized (absolute) bounding boxes.
1352
+ order_logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_queries)`):
+ Reading-order logits of the final layer of the decoder.
+ out_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height, width)`):
+ Mask logits of the final layer of the decoder.
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the decoder of the model.
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+ Stacked intermediate hidden states (output of each layer of the decoder).
+ intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, config.num_labels)`):
+ Stacked intermediate logits (logits of each layer of the decoder).
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+ Stacked intermediate reference points (reference points of each layer of the decoder).
+ intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+ Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
+ initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+ Stacked initial reference points (initial reference points of each layer of the decoder).
+ init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+ Initial reference points sent through the Transformer decoder.
+ enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Classification logits of the top scoring bounding boxes picked in the encoder (first stage).
+ enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Normalized coordinates of the top scoring bounding boxes predicted in the encoder (first stage).
+ enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+ picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+ foreground and background).
+ enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Logits of predicted bounding boxes coordinates in the first stage.
+ denoising_meta_values (`dict`):
+ Extra dictionary for the denoising related values
+ """
+
+ logits: torch.FloatTensor | None = None
+ pred_boxes: torch.FloatTensor | None = None
+ order_logits: torch.FloatTensor | None = None
+ out_masks: torch.FloatTensor | None = None
+ last_hidden_state: torch.FloatTensor | None = None
+ intermediate_hidden_states: torch.FloatTensor | None = None
+ intermediate_logits: torch.FloatTensor | None = None
+ intermediate_reference_points: torch.FloatTensor | None = None
+ intermediate_predicted_corners: torch.FloatTensor | None = None
+ initial_reference_points: torch.FloatTensor | None = None
+ decoder_hidden_states: tuple[torch.FloatTensor] | None = None
+ decoder_attentions: tuple[torch.FloatTensor] | None = None
+ cross_attentions: tuple[torch.FloatTensor] | None = None
+ encoder_last_hidden_state: torch.FloatTensor | None = None
+ encoder_hidden_states: tuple[torch.FloatTensor] | None = None
+ encoder_attentions: tuple[torch.FloatTensor] | None = None
+ init_reference_points: tuple[torch.FloatTensor] | None = None
+ enc_topk_logits: torch.FloatTensor | None = None
+ enc_topk_bboxes: torch.FloatTensor | None = None
+ enc_outputs_class: torch.FloatTensor | None = None
+ enc_outputs_coord_logits: torch.FloatTensor | None = None
+ denoising_meta_values: dict | None = None
+
+
+ @auto_docstring(
+ custom_intro="""
+ PP-DocLayoutV3 Model (consisting of a backbone and encoder-decoder) outputting bounding boxes and logits sorted according to reading order,
+ which are further decoded into scores and classes.
+ """
1413
+ )
1414
+ class PPDocLayoutV3ForObjectDetection(RTDetrForObjectDetection, PPDocLayoutV3PreTrainedModel):
1415
+ _keys_to_ignore_on_load_missing = ["num_batches_tracked", "rel_pos_y_bias", "rel_pos_x_bias"]
1416
+
1417
+ def __init__(self, config: PPDocLayoutV3Config):
1418
+ super().__init__(config)
1419
+
1420
+ del self.model.decoder.class_embed
1421
+ del self.model.decoder.bbox_embed
1422
+ del num_pred # noqa
1423
+
1424
+ self.model.denoising_class_embed = nn.Embedding(config.num_labels, config.d_model)
1425
+ self.num_queries = config.num_queries
1426
+
1427
+ self.post_init()
1428
+
1429
+ @auto_docstring
1430
+ @can_return_tuple
1431
+ def forward(
1432
+ self,
1433
+ pixel_values: torch.FloatTensor,
1434
+ pixel_mask: torch.LongTensor | None = None,
1435
+ encoder_outputs: torch.FloatTensor | None = None,
1436
+ labels: list[dict] | None = None,
1437
+ **kwargs: Unpack[TransformersKwargs],
1438
+ ) -> tuple[torch.FloatTensor] | PPDocLayoutV3ForObjectDetectionOutput:
1439
+ r"""
1440
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1441
+ Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
1442
+ can choose to directly pass a flattened representation of an image.
1443
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
1444
+ Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
1445
+ embedded representation.
1446
+ labels (`list[Dict]` of len `(batch_size,)`, *optional*):
1447
+ Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
1448
+ following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
1449
+ respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
1450
+ in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
1451
+
1452
+ Examples:
1453
+
1454
+ ```python
1455
+ >>> from transformers import AutoModelForObjectDetection, AutoImageProcessor
1456
+ >>> from PIL import Image
1457
+ >>> import requests
1458
+ >>> import torch
1459
+
1460
+ >>> url = "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/layout_demo.jpg"
1461
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1462
+
1463
+ >>> model_path = "PaddlePaddle/PP-DocLayoutV3_safetensors"
1464
+ >>> image_processor = AutoImageProcessor.from_pretrained(model_path)
1465
+ >>> model = AutoModelForObjectDetection.from_pretrained(model_path)
1466
+
1467
+ >>> # prepare image for the model
1468
+ >>> inputs = image_processor(images=[image], return_tensors="pt")
1469
+
1470
+ >>> # forward pass
1471
+ >>> outputs = model(**inputs)
1472
+
1473
+ >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
1474
+ >>> results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.size[::-1]]))
1475
+
1476
+ >>> # print outputs
1477
+ >>> for result in results:
1478
+ ... for idx, (score, label_id, box) in enumerate(zip(result["scores"], result["labels"], result["boxes"])):
1479
+ ... score, label = score.item(), label_id.item()
1480
+ ... box = [round(i, 2) for i in box.tolist()]
1481
+ ... print(f"Order {idx + 1}: {model.config.id2label[label]}: {score:.2f} {box}")
1482
+ Order 1: text: 0.99 [334.95, 184.78, 897.25, 654.83]
1483
+ Order 2: paragraph_title: 0.97 [337.28, 683.92, 869.16, 798.35]
1484
+ Order 3: text: 0.99 [335.75, 842.82, 892.13, 1454.32]
1485
+ Order 4: text: 0.99 [920.18, 185.28, 1476.38, 464.49]
1486
+ Order 5: text: 0.98 [920.47, 483.68, 1480.63, 765.72]
1487
+ Order 6: text: 0.98 [920.62, 846.8, 1482.09, 1220.67]
1488
+ Order 7: text: 0.97 [920.92, 1239.41, 1469.55, 1378.02]
1489
+ Order 8: footnote: 0.86 [335.03, 1614.68, 1483.33, 1731.73]
1490
+ Order 9: footnote: 0.83 [334.64, 1756.74, 1471.78, 1845.69]
1491
+ Order 10: text: 0.81 [336.8, 1910.52, 661.64, 1939.92]
1492
+ Order 11: footnote: 0.96 [336.24, 2114.42, 1450.14, 2172.12]
1493
+ Order 12: number: 0.88 [106.0, 2257.5, 135.84, 2282.18]
1494
+ Order 13: footer: 0.93 [338.4, 2255.52, 986.15, 2284.37]
1495
+ ```"""
1496
+ outputs = self.model(
1497
+ pixel_values,
1498
+ pixel_mask=pixel_mask,
1499
+ encoder_outputs=encoder_outputs,
1500
+ labels=labels,
1501
+ **kwargs,
1502
+ )
1503
+
1504
+ intermediate_logits = outputs.intermediate_logits
1505
+ intermediate_reference_points = outputs.intermediate_reference_points
1506
+ order_logits = outputs.out_order_logits
1507
+ out_masks = outputs.out_masks
1508
+
1509
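+ # Only the last decoder layer's predictions are exposed as the final logits, boxes, order
+ # logits and masks; the full per-layer stacks remain available on the returned output.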
+ pred_boxes = intermediate_reference_points[:, -1]
+ logits = intermediate_logits[:, -1]
+ order_logits = order_logits[:, -1]
+ out_masks = out_masks[:, -1]
+
+ if labels is not None:
+ raise ValueError("PPDocLayoutV3ForObjectDetection does not support training")
+
+ return PPDocLayoutV3ForObjectDetectionOutput(
+ logits=logits,
+ pred_boxes=pred_boxes,
+ order_logits=order_logits,
+ out_masks=out_masks,
+ last_hidden_state=outputs.last_hidden_state,
+ intermediate_hidden_states=outputs.intermediate_hidden_states,
+ intermediate_logits=outputs.intermediate_logits,
+ intermediate_reference_points=outputs.intermediate_reference_points,
+ intermediate_predicted_corners=outputs.intermediate_predicted_corners,
+ initial_reference_points=outputs.initial_reference_points,
+ decoder_hidden_states=outputs.decoder_hidden_states,
+ decoder_attentions=outputs.decoder_attentions,
+ cross_attentions=outputs.cross_attentions,
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+ encoder_hidden_states=outputs.encoder_hidden_states,
+ encoder_attentions=outputs.encoder_attentions,
+ init_reference_points=outputs.init_reference_points,
+ enc_topk_logits=outputs.enc_topk_logits,
+ enc_topk_bboxes=outputs.enc_topk_bboxes,
+ enc_outputs_class=outputs.enc_outputs_class,
+ enc_outputs_coord_logits=outputs.enc_outputs_coord_logits,
+ denoising_meta_values=outputs.denoising_meta_values,
+ )
+
+
+ __all__ = [
+ "PPDocLayoutV3ForObjectDetection",
+ "PPDocLayoutV3ImageProcessorFast",
+ "PPDocLayoutV3Config",
+ "PPDocLayoutV3Model",
+ "PPDocLayoutV3PreTrainedModel",
+ ]