transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1021)
  1. transformers/__init__.py +4 -11
  2. transformers/activations.py +2 -2
  3. transformers/backbone_utils.py +326 -0
  4. transformers/cache_utils.py +11 -2
  5. transformers/cli/serve.py +11 -8
  6. transformers/configuration_utils.py +1 -69
  7. transformers/conversion_mapping.py +146 -26
  8. transformers/convert_slow_tokenizer.py +6 -4
  9. transformers/core_model_loading.py +207 -118
  10. transformers/dependency_versions_check.py +0 -1
  11. transformers/dependency_versions_table.py +7 -8
  12. transformers/file_utils.py +0 -2
  13. transformers/generation/candidate_generator.py +1 -2
  14. transformers/generation/continuous_batching/cache.py +40 -38
  15. transformers/generation/continuous_batching/cache_manager.py +3 -16
  16. transformers/generation/continuous_batching/continuous_api.py +94 -406
  17. transformers/generation/continuous_batching/input_ouputs.py +464 -0
  18. transformers/generation/continuous_batching/requests.py +54 -17
  19. transformers/generation/continuous_batching/scheduler.py +77 -95
  20. transformers/generation/logits_process.py +10 -5
  21. transformers/generation/stopping_criteria.py +1 -2
  22. transformers/generation/utils.py +75 -95
  23. transformers/image_processing_utils.py +0 -3
  24. transformers/image_processing_utils_fast.py +17 -18
  25. transformers/image_transforms.py +44 -13
  26. transformers/image_utils.py +0 -5
  27. transformers/initialization.py +57 -0
  28. transformers/integrations/__init__.py +10 -24
  29. transformers/integrations/accelerate.py +47 -11
  30. transformers/integrations/deepspeed.py +145 -3
  31. transformers/integrations/executorch.py +2 -6
  32. transformers/integrations/finegrained_fp8.py +142 -7
  33. transformers/integrations/flash_attention.py +2 -7
  34. transformers/integrations/hub_kernels.py +18 -7
  35. transformers/integrations/moe.py +226 -106
  36. transformers/integrations/mxfp4.py +47 -34
  37. transformers/integrations/peft.py +488 -176
  38. transformers/integrations/tensor_parallel.py +641 -581
  39. transformers/masking_utils.py +153 -9
  40. transformers/modeling_flash_attention_utils.py +1 -2
  41. transformers/modeling_utils.py +359 -358
  42. transformers/models/__init__.py +6 -0
  43. transformers/models/afmoe/configuration_afmoe.py +14 -4
  44. transformers/models/afmoe/modeling_afmoe.py +8 -8
  45. transformers/models/afmoe/modular_afmoe.py +7 -7
  46. transformers/models/aimv2/configuration_aimv2.py +2 -7
  47. transformers/models/aimv2/modeling_aimv2.py +26 -24
  48. transformers/models/aimv2/modular_aimv2.py +8 -12
  49. transformers/models/albert/configuration_albert.py +8 -1
  50. transformers/models/albert/modeling_albert.py +3 -3
  51. transformers/models/align/configuration_align.py +8 -5
  52. transformers/models/align/modeling_align.py +22 -24
  53. transformers/models/altclip/configuration_altclip.py +4 -6
  54. transformers/models/altclip/modeling_altclip.py +30 -26
  55. transformers/models/apertus/configuration_apertus.py +5 -7
  56. transformers/models/apertus/modeling_apertus.py +4 -4
  57. transformers/models/apertus/modular_apertus.py +8 -10
  58. transformers/models/arcee/configuration_arcee.py +5 -7
  59. transformers/models/arcee/modeling_arcee.py +4 -4
  60. transformers/models/aria/configuration_aria.py +11 -21
  61. transformers/models/aria/modeling_aria.py +39 -36
  62. transformers/models/aria/modular_aria.py +33 -39
  63. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
  64. transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
  65. transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
  66. transformers/models/auto/auto_factory.py +8 -6
  67. transformers/models/auto/configuration_auto.py +22 -0
  68. transformers/models/auto/image_processing_auto.py +17 -13
  69. transformers/models/auto/modeling_auto.py +15 -0
  70. transformers/models/auto/processing_auto.py +9 -18
  71. transformers/models/auto/tokenization_auto.py +17 -15
  72. transformers/models/autoformer/modeling_autoformer.py +2 -1
  73. transformers/models/aya_vision/configuration_aya_vision.py +4 -0
  74. transformers/models/aya_vision/modeling_aya_vision.py +29 -62
  75. transformers/models/aya_vision/modular_aya_vision.py +20 -45
  76. transformers/models/bamba/configuration_bamba.py +17 -7
  77. transformers/models/bamba/modeling_bamba.py +23 -55
  78. transformers/models/bamba/modular_bamba.py +19 -54
  79. transformers/models/bark/configuration_bark.py +2 -1
  80. transformers/models/bark/modeling_bark.py +24 -10
  81. transformers/models/bart/configuration_bart.py +9 -4
  82. transformers/models/bart/modeling_bart.py +9 -12
  83. transformers/models/beit/configuration_beit.py +2 -4
  84. transformers/models/beit/image_processing_beit_fast.py +3 -3
  85. transformers/models/beit/modeling_beit.py +14 -9
  86. transformers/models/bert/configuration_bert.py +12 -1
  87. transformers/models/bert/modeling_bert.py +6 -30
  88. transformers/models/bert_generation/configuration_bert_generation.py +17 -1
  89. transformers/models/bert_generation/modeling_bert_generation.py +6 -6
  90. transformers/models/big_bird/configuration_big_bird.py +12 -8
  91. transformers/models/big_bird/modeling_big_bird.py +0 -15
  92. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
  93. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
  94. transformers/models/biogpt/configuration_biogpt.py +8 -1
  95. transformers/models/biogpt/modeling_biogpt.py +4 -8
  96. transformers/models/biogpt/modular_biogpt.py +1 -5
  97. transformers/models/bit/configuration_bit.py +2 -4
  98. transformers/models/bit/modeling_bit.py +6 -5
  99. transformers/models/bitnet/configuration_bitnet.py +5 -7
  100. transformers/models/bitnet/modeling_bitnet.py +3 -4
  101. transformers/models/bitnet/modular_bitnet.py +3 -4
  102. transformers/models/blenderbot/configuration_blenderbot.py +8 -4
  103. transformers/models/blenderbot/modeling_blenderbot.py +4 -4
  104. transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
  105. transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
  106. transformers/models/blip/configuration_blip.py +9 -9
  107. transformers/models/blip/modeling_blip.py +55 -37
  108. transformers/models/blip_2/configuration_blip_2.py +2 -1
  109. transformers/models/blip_2/modeling_blip_2.py +81 -56
  110. transformers/models/bloom/configuration_bloom.py +5 -1
  111. transformers/models/bloom/modeling_bloom.py +2 -1
  112. transformers/models/blt/configuration_blt.py +23 -12
  113. transformers/models/blt/modeling_blt.py +20 -14
  114. transformers/models/blt/modular_blt.py +70 -10
  115. transformers/models/bridgetower/configuration_bridgetower.py +7 -1
  116. transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
  117. transformers/models/bridgetower/modeling_bridgetower.py +29 -15
  118. transformers/models/bros/configuration_bros.py +24 -17
  119. transformers/models/camembert/configuration_camembert.py +8 -1
  120. transformers/models/camembert/modeling_camembert.py +6 -6
  121. transformers/models/canine/configuration_canine.py +4 -1
  122. transformers/models/chameleon/configuration_chameleon.py +5 -7
  123. transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
  124. transformers/models/chameleon/modeling_chameleon.py +82 -36
  125. transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
  126. transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
  127. transformers/models/clap/configuration_clap.py +4 -8
  128. transformers/models/clap/modeling_clap.py +21 -22
  129. transformers/models/clip/configuration_clip.py +4 -1
  130. transformers/models/clip/image_processing_clip_fast.py +9 -0
  131. transformers/models/clip/modeling_clip.py +25 -22
  132. transformers/models/clipseg/configuration_clipseg.py +4 -1
  133. transformers/models/clipseg/modeling_clipseg.py +27 -25
  134. transformers/models/clipseg/processing_clipseg.py +11 -3
  135. transformers/models/clvp/configuration_clvp.py +14 -2
  136. transformers/models/clvp/modeling_clvp.py +19 -30
  137. transformers/models/codegen/configuration_codegen.py +4 -3
  138. transformers/models/codegen/modeling_codegen.py +2 -1
  139. transformers/models/cohere/configuration_cohere.py +5 -7
  140. transformers/models/cohere/modeling_cohere.py +4 -4
  141. transformers/models/cohere/modular_cohere.py +3 -3
  142. transformers/models/cohere2/configuration_cohere2.py +6 -8
  143. transformers/models/cohere2/modeling_cohere2.py +4 -4
  144. transformers/models/cohere2/modular_cohere2.py +9 -11
  145. transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
  146. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
  147. transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
  148. transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
  149. transformers/models/colqwen2/modeling_colqwen2.py +7 -6
  150. transformers/models/colqwen2/modular_colqwen2.py +7 -6
  151. transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
  152. transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
  153. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
  154. transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
  155. transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
  156. transformers/models/convbert/configuration_convbert.py +11 -7
  157. transformers/models/convnext/configuration_convnext.py +2 -4
  158. transformers/models/convnext/image_processing_convnext_fast.py +2 -2
  159. transformers/models/convnext/modeling_convnext.py +7 -6
  160. transformers/models/convnextv2/configuration_convnextv2.py +2 -4
  161. transformers/models/convnextv2/modeling_convnextv2.py +7 -6
  162. transformers/models/cpmant/configuration_cpmant.py +4 -0
  163. transformers/models/csm/configuration_csm.py +9 -15
  164. transformers/models/csm/modeling_csm.py +3 -3
  165. transformers/models/ctrl/configuration_ctrl.py +16 -0
  166. transformers/models/ctrl/modeling_ctrl.py +13 -25
  167. transformers/models/cwm/configuration_cwm.py +5 -7
  168. transformers/models/cwm/modeling_cwm.py +4 -4
  169. transformers/models/d_fine/configuration_d_fine.py +10 -56
  170. transformers/models/d_fine/modeling_d_fine.py +728 -868
  171. transformers/models/d_fine/modular_d_fine.py +335 -412
  172. transformers/models/dab_detr/configuration_dab_detr.py +22 -48
  173. transformers/models/dab_detr/modeling_dab_detr.py +11 -7
  174. transformers/models/dac/modeling_dac.py +1 -1
  175. transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
  176. transformers/models/data2vec/configuration_data2vec_text.py +11 -2
  177. transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
  178. transformers/models/data2vec/modeling_data2vec_text.py +6 -6
  179. transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
  180. transformers/models/dbrx/configuration_dbrx.py +11 -3
  181. transformers/models/dbrx/modeling_dbrx.py +6 -6
  182. transformers/models/dbrx/modular_dbrx.py +6 -6
  183. transformers/models/deberta/configuration_deberta.py +6 -0
  184. transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
  185. transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
  186. transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
  187. transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
  188. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
  189. transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
  190. transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
  191. transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
  192. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
  193. transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
  194. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
  195. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
  196. transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
  197. transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
  198. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
  199. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
  200. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
  201. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
  202. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
  203. transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
  204. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
  205. transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
  206. transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
  207. transformers/models/deit/modeling_deit.py +11 -7
  208. transformers/models/depth_anything/configuration_depth_anything.py +12 -42
  209. transformers/models/depth_anything/modeling_depth_anything.py +5 -3
  210. transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
  211. transformers/models/depth_pro/modeling_depth_pro.py +8 -4
  212. transformers/models/detr/configuration_detr.py +18 -49
  213. transformers/models/detr/image_processing_detr_fast.py +11 -11
  214. transformers/models/detr/modeling_detr.py +695 -734
  215. transformers/models/dia/configuration_dia.py +4 -7
  216. transformers/models/dia/generation_dia.py +8 -17
  217. transformers/models/dia/modeling_dia.py +7 -7
  218. transformers/models/dia/modular_dia.py +4 -4
  219. transformers/models/diffllama/configuration_diffllama.py +5 -7
  220. transformers/models/diffllama/modeling_diffllama.py +3 -8
  221. transformers/models/diffllama/modular_diffllama.py +2 -7
  222. transformers/models/dinat/configuration_dinat.py +2 -4
  223. transformers/models/dinat/modeling_dinat.py +7 -6
  224. transformers/models/dinov2/configuration_dinov2.py +2 -4
  225. transformers/models/dinov2/modeling_dinov2.py +9 -8
  226. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
  227. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
  228. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
  229. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
  230. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
  231. transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
  232. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
  233. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
  234. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
  235. transformers/models/distilbert/configuration_distilbert.py +8 -1
  236. transformers/models/distilbert/modeling_distilbert.py +3 -3
  237. transformers/models/doge/configuration_doge.py +17 -7
  238. transformers/models/doge/modeling_doge.py +4 -4
  239. transformers/models/doge/modular_doge.py +20 -10
  240. transformers/models/donut/image_processing_donut_fast.py +4 -4
  241. transformers/models/dots1/configuration_dots1.py +16 -7
  242. transformers/models/dots1/modeling_dots1.py +4 -4
  243. transformers/models/dpr/configuration_dpr.py +19 -1
  244. transformers/models/dpt/configuration_dpt.py +23 -65
  245. transformers/models/dpt/image_processing_dpt_fast.py +5 -5
  246. transformers/models/dpt/modeling_dpt.py +19 -15
  247. transformers/models/dpt/modular_dpt.py +4 -4
  248. transformers/models/edgetam/configuration_edgetam.py +1 -1
  249. transformers/models/edgetam/modeling_edgetam.py +53 -53
  250. transformers/models/edgetam/modular_edgetam.py +5 -7
  251. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
  252. transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
  253. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
  254. transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
  255. transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
  256. transformers/models/electra/configuration_electra.py +13 -2
  257. transformers/models/electra/modeling_electra.py +6 -6
  258. transformers/models/emu3/configuration_emu3.py +12 -10
  259. transformers/models/emu3/modeling_emu3.py +84 -47
  260. transformers/models/emu3/modular_emu3.py +77 -39
  261. transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
  262. transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
  263. transformers/models/eomt/configuration_eomt.py +12 -13
  264. transformers/models/eomt/image_processing_eomt_fast.py +3 -3
  265. transformers/models/eomt/modeling_eomt.py +3 -3
  266. transformers/models/eomt/modular_eomt.py +17 -17
  267. transformers/models/eomt_dinov3/__init__.py +28 -0
  268. transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
  269. transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
  270. transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
  271. transformers/models/ernie/configuration_ernie.py +24 -2
  272. transformers/models/ernie/modeling_ernie.py +6 -30
  273. transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
  274. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  275. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
  276. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
  277. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
  278. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
  279. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
  280. transformers/models/esm/configuration_esm.py +9 -11
  281. transformers/models/esm/modeling_esm.py +3 -3
  282. transformers/models/esm/modeling_esmfold.py +1 -6
  283. transformers/models/esm/openfold_utils/protein.py +2 -3
  284. transformers/models/evolla/configuration_evolla.py +21 -8
  285. transformers/models/evolla/modeling_evolla.py +11 -7
  286. transformers/models/evolla/modular_evolla.py +5 -1
  287. transformers/models/exaone4/configuration_exaone4.py +8 -5
  288. transformers/models/exaone4/modeling_exaone4.py +4 -4
  289. transformers/models/exaone4/modular_exaone4.py +11 -8
  290. transformers/models/exaone_moe/__init__.py +27 -0
  291. transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
  292. transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
  293. transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
  294. transformers/models/falcon/configuration_falcon.py +9 -1
  295. transformers/models/falcon/modeling_falcon.py +3 -8
  296. transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
  297. transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
  298. transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
  299. transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
  300. transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
  301. transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
  302. transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
  303. transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
  304. transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
  305. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
  306. transformers/models/flaubert/configuration_flaubert.py +10 -4
  307. transformers/models/flaubert/modeling_flaubert.py +1 -1
  308. transformers/models/flava/configuration_flava.py +4 -3
  309. transformers/models/flava/image_processing_flava_fast.py +4 -4
  310. transformers/models/flava/modeling_flava.py +36 -28
  311. transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
  312. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
  313. transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
  314. transformers/models/florence2/configuration_florence2.py +4 -0
  315. transformers/models/florence2/modeling_florence2.py +57 -32
  316. transformers/models/florence2/modular_florence2.py +48 -26
  317. transformers/models/fnet/configuration_fnet.py +6 -1
  318. transformers/models/focalnet/configuration_focalnet.py +2 -4
  319. transformers/models/focalnet/modeling_focalnet.py +10 -7
  320. transformers/models/fsmt/configuration_fsmt.py +12 -16
  321. transformers/models/funnel/configuration_funnel.py +8 -0
  322. transformers/models/fuyu/configuration_fuyu.py +5 -8
  323. transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
  324. transformers/models/fuyu/modeling_fuyu.py +24 -23
  325. transformers/models/gemma/configuration_gemma.py +5 -7
  326. transformers/models/gemma/modeling_gemma.py +4 -4
  327. transformers/models/gemma/modular_gemma.py +5 -7
  328. transformers/models/gemma2/configuration_gemma2.py +5 -7
  329. transformers/models/gemma2/modeling_gemma2.py +4 -4
  330. transformers/models/gemma2/modular_gemma2.py +8 -10
  331. transformers/models/gemma3/configuration_gemma3.py +28 -22
  332. transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
  333. transformers/models/gemma3/modeling_gemma3.py +37 -33
  334. transformers/models/gemma3/modular_gemma3.py +46 -42
  335. transformers/models/gemma3n/configuration_gemma3n.py +35 -22
  336. transformers/models/gemma3n/modeling_gemma3n.py +86 -58
  337. transformers/models/gemma3n/modular_gemma3n.py +112 -75
  338. transformers/models/git/configuration_git.py +5 -7
  339. transformers/models/git/modeling_git.py +31 -41
  340. transformers/models/glm/configuration_glm.py +7 -9
  341. transformers/models/glm/modeling_glm.py +4 -4
  342. transformers/models/glm4/configuration_glm4.py +7 -9
  343. transformers/models/glm4/modeling_glm4.py +4 -4
  344. transformers/models/glm46v/configuration_glm46v.py +4 -0
  345. transformers/models/glm46v/image_processing_glm46v.py +5 -2
  346. transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
  347. transformers/models/glm46v/modeling_glm46v.py +91 -46
  348. transformers/models/glm46v/modular_glm46v.py +4 -0
  349. transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
  350. transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
  351. transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
  352. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
  353. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
  354. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
  355. transformers/models/glm4v/configuration_glm4v.py +12 -8
  356. transformers/models/glm4v/image_processing_glm4v.py +5 -2
  357. transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
  358. transformers/models/glm4v/modeling_glm4v.py +120 -63
  359. transformers/models/glm4v/modular_glm4v.py +82 -50
  360. transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
  361. transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
  362. transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
  363. transformers/models/glm_image/configuration_glm_image.py +26 -20
  364. transformers/models/glm_image/image_processing_glm_image.py +1 -1
  365. transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
  366. transformers/models/glm_image/modeling_glm_image.py +337 -236
  367. transformers/models/glm_image/modular_glm_image.py +415 -255
  368. transformers/models/glm_image/processing_glm_image.py +65 -17
  369. transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
  370. transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
  371. transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
  372. transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
  373. transformers/models/glmasr/modeling_glmasr.py +34 -28
  374. transformers/models/glmasr/modular_glmasr.py +23 -11
  375. transformers/models/glpn/image_processing_glpn_fast.py +3 -3
  376. transformers/models/glpn/modeling_glpn.py +4 -2
  377. transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
  378. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
  379. transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
  380. transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
  381. transformers/models/gpt2/configuration_gpt2.py +13 -1
  382. transformers/models/gpt2/modeling_gpt2.py +5 -5
  383. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
  384. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
  385. transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
  386. transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
  387. transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
  388. transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
  389. transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
  390. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
  391. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
  392. transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
  393. transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
  394. transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
  395. transformers/models/gptj/configuration_gptj.py +4 -4
  396. transformers/models/gptj/modeling_gptj.py +3 -7
  397. transformers/models/granite/configuration_granite.py +5 -7
  398. transformers/models/granite/modeling_granite.py +4 -4
  399. transformers/models/granite_speech/modeling_granite_speech.py +63 -37
  400. transformers/models/granitemoe/configuration_granitemoe.py +5 -7
  401. transformers/models/granitemoe/modeling_granitemoe.py +4 -4
  402. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
  403. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
  404. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
  405. transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
  406. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
  407. transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
  408. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
  409. transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
  410. transformers/models/groupvit/configuration_groupvit.py +4 -1
  411. transformers/models/groupvit/modeling_groupvit.py +29 -22
  412. transformers/models/helium/configuration_helium.py +5 -7
  413. transformers/models/helium/modeling_helium.py +4 -4
  414. transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
  415. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
  416. transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
  417. transformers/models/hiera/configuration_hiera.py +2 -4
  418. transformers/models/hiera/modeling_hiera.py +11 -8
  419. transformers/models/hubert/configuration_hubert.py +4 -1
  420. transformers/models/hubert/modeling_hubert.py +7 -4
  421. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
  422. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
  423. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
  424. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
  425. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
  426. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
  427. transformers/models/ibert/configuration_ibert.py +4 -1
  428. transformers/models/idefics/configuration_idefics.py +5 -7
  429. transformers/models/idefics/modeling_idefics.py +3 -4
  430. transformers/models/idefics/vision.py +5 -4
  431. transformers/models/idefics2/configuration_idefics2.py +1 -2
  432. transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
  433. transformers/models/idefics2/modeling_idefics2.py +72 -50
  434. transformers/models/idefics3/configuration_idefics3.py +1 -3
  435. transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
  436. transformers/models/idefics3/modeling_idefics3.py +63 -40
  437. transformers/models/ijepa/modeling_ijepa.py +3 -3
  438. transformers/models/imagegpt/configuration_imagegpt.py +9 -1
  439. transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
  440. transformers/models/imagegpt/modeling_imagegpt.py +8 -4
  441. transformers/models/informer/modeling_informer.py +3 -3
  442. transformers/models/instructblip/configuration_instructblip.py +2 -1
  443. transformers/models/instructblip/modeling_instructblip.py +65 -39
  444. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
  445. transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
  446. transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
  447. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
  448. transformers/models/internvl/configuration_internvl.py +5 -0
  449. transformers/models/internvl/modeling_internvl.py +35 -55
  450. transformers/models/internvl/modular_internvl.py +26 -38
  451. transformers/models/internvl/video_processing_internvl.py +2 -2
  452. transformers/models/jais2/configuration_jais2.py +5 -7
  453. transformers/models/jais2/modeling_jais2.py +4 -4
  454. transformers/models/jamba/configuration_jamba.py +5 -7
  455. transformers/models/jamba/modeling_jamba.py +4 -4
  456. transformers/models/jamba/modular_jamba.py +3 -3
  457. transformers/models/janus/image_processing_janus.py +2 -2
  458. transformers/models/janus/image_processing_janus_fast.py +8 -8
  459. transformers/models/janus/modeling_janus.py +63 -146
  460. transformers/models/janus/modular_janus.py +62 -20
  461. transformers/models/jetmoe/configuration_jetmoe.py +6 -4
  462. transformers/models/jetmoe/modeling_jetmoe.py +3 -3
  463. transformers/models/jetmoe/modular_jetmoe.py +3 -3
  464. transformers/models/kosmos2/configuration_kosmos2.py +10 -8
  465. transformers/models/kosmos2/modeling_kosmos2.py +56 -34
  466. transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
  467. transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
  468. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
  469. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
  470. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
  471. transformers/models/lasr/configuration_lasr.py +2 -4
  472. transformers/models/lasr/modeling_lasr.py +3 -3
  473. transformers/models/lasr/modular_lasr.py +3 -3
  474. transformers/models/layoutlm/configuration_layoutlm.py +14 -1
  475. transformers/models/layoutlm/modeling_layoutlm.py +3 -3
  476. transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
  477. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
  478. transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
  479. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
  480. transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
  481. transformers/models/led/configuration_led.py +7 -8
  482. transformers/models/levit/image_processing_levit_fast.py +4 -4
  483. transformers/models/lfm2/configuration_lfm2.py +5 -7
  484. transformers/models/lfm2/modeling_lfm2.py +4 -4
  485. transformers/models/lfm2/modular_lfm2.py +3 -3
  486. transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
  487. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
  488. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  489. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
  490. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
  491. transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
  492. transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
  493. transformers/models/lightglue/modeling_lightglue.py +3 -3
  494. transformers/models/lightglue/modular_lightglue.py +3 -3
  495. transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
  496. transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
  497. transformers/models/lilt/configuration_lilt.py +6 -1
  498. transformers/models/llama/configuration_llama.py +5 -7
  499. transformers/models/llama/modeling_llama.py +4 -4
  500. transformers/models/llama4/configuration_llama4.py +67 -47
  501. transformers/models/llama4/image_processing_llama4_fast.py +3 -3
  502. transformers/models/llama4/modeling_llama4.py +46 -44
  503. transformers/models/llava/configuration_llava.py +10 -0
  504. transformers/models/llava/image_processing_llava_fast.py +3 -3
  505. transformers/models/llava/modeling_llava.py +38 -65
  506. transformers/models/llava_next/configuration_llava_next.py +2 -1
  507. transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
  508. transformers/models/llava_next/modeling_llava_next.py +61 -60
  509. transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
  510. transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
  511. transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
  512. transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
  513. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
  514. transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
  515. transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
  516. transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
  517. transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
  518. transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
  519. transformers/models/longformer/configuration_longformer.py +4 -1
  520. transformers/models/longt5/configuration_longt5.py +9 -6
  521. transformers/models/longt5/modeling_longt5.py +2 -1
  522. transformers/models/luke/configuration_luke.py +8 -1
  523. transformers/models/lw_detr/configuration_lw_detr.py +19 -31
  524. transformers/models/lw_detr/modeling_lw_detr.py +43 -44
  525. transformers/models/lw_detr/modular_lw_detr.py +36 -38
  526. transformers/models/lxmert/configuration_lxmert.py +16 -0
  527. transformers/models/m2m_100/configuration_m2m_100.py +7 -8
  528. transformers/models/m2m_100/modeling_m2m_100.py +3 -3
  529. transformers/models/mamba/configuration_mamba.py +5 -2
  530. transformers/models/mamba/modeling_mamba.py +18 -26
  531. transformers/models/mamba2/configuration_mamba2.py +5 -7
  532. transformers/models/mamba2/modeling_mamba2.py +22 -33
  533. transformers/models/marian/configuration_marian.py +10 -4
  534. transformers/models/marian/modeling_marian.py +4 -4
  535. transformers/models/markuplm/configuration_markuplm.py +4 -6
  536. transformers/models/markuplm/modeling_markuplm.py +3 -3
  537. transformers/models/mask2former/configuration_mask2former.py +12 -47
  538. transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
  539. transformers/models/mask2former/modeling_mask2former.py +18 -12
  540. transformers/models/maskformer/configuration_maskformer.py +14 -45
  541. transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
  542. transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
  543. transformers/models/maskformer/modeling_maskformer.py +15 -9
  544. transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
  545. transformers/models/mbart/configuration_mbart.py +9 -4
  546. transformers/models/mbart/modeling_mbart.py +9 -6
  547. transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
  548. transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
  549. transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
  550. transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
  551. transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
  552. transformers/models/mgp_str/modeling_mgp_str.py +4 -2
  553. transformers/models/mimi/configuration_mimi.py +4 -0
  554. transformers/models/mimi/modeling_mimi.py +40 -36
  555. transformers/models/minimax/configuration_minimax.py +8 -11
  556. transformers/models/minimax/modeling_minimax.py +5 -5
  557. transformers/models/minimax/modular_minimax.py +9 -12
  558. transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
  559. transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
  560. transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
  561. transformers/models/ministral/configuration_ministral.py +5 -7
  562. transformers/models/ministral/modeling_ministral.py +4 -4
  563. transformers/models/ministral/modular_ministral.py +5 -8
  564. transformers/models/ministral3/configuration_ministral3.py +4 -4
  565. transformers/models/ministral3/modeling_ministral3.py +4 -4
  566. transformers/models/ministral3/modular_ministral3.py +3 -3
  567. transformers/models/mistral/configuration_mistral.py +5 -7
  568. transformers/models/mistral/modeling_mistral.py +4 -4
  569. transformers/models/mistral/modular_mistral.py +3 -3
  570. transformers/models/mistral3/configuration_mistral3.py +4 -0
  571. transformers/models/mistral3/modeling_mistral3.py +36 -40
  572. transformers/models/mistral3/modular_mistral3.py +31 -32
  573. transformers/models/mixtral/configuration_mixtral.py +8 -11
  574. transformers/models/mixtral/modeling_mixtral.py +4 -4
  575. transformers/models/mlcd/modeling_mlcd.py +7 -5
  576. transformers/models/mlcd/modular_mlcd.py +7 -5
  577. transformers/models/mllama/configuration_mllama.py +5 -7
  578. transformers/models/mllama/image_processing_mllama_fast.py +6 -5
  579. transformers/models/mllama/modeling_mllama.py +19 -19
  580. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
  581. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
  582. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
  583. transformers/models/mobilebert/configuration_mobilebert.py +4 -1
  584. transformers/models/mobilebert/modeling_mobilebert.py +3 -3
  585. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
  586. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
  587. transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
  588. transformers/models/mobilevit/modeling_mobilevit.py +4 -2
  589. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
  590. transformers/models/modernbert/configuration_modernbert.py +46 -21
  591. transformers/models/modernbert/modeling_modernbert.py +146 -899
  592. transformers/models/modernbert/modular_modernbert.py +185 -908
  593. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
  594. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
  595. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
  596. transformers/models/moonshine/configuration_moonshine.py +12 -7
  597. transformers/models/moonshine/modeling_moonshine.py +7 -7
  598. transformers/models/moonshine/modular_moonshine.py +19 -13
  599. transformers/models/moshi/configuration_moshi.py +28 -2
  600. transformers/models/moshi/modeling_moshi.py +4 -9
  601. transformers/models/mpnet/configuration_mpnet.py +6 -1
  602. transformers/models/mpt/configuration_mpt.py +16 -0
  603. transformers/models/mra/configuration_mra.py +8 -1
  604. transformers/models/mt5/configuration_mt5.py +9 -5
  605. transformers/models/mt5/modeling_mt5.py +5 -8
  606. transformers/models/musicgen/configuration_musicgen.py +12 -7
  607. transformers/models/musicgen/modeling_musicgen.py +6 -5
  608. transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
  609. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
  610. transformers/models/mvp/configuration_mvp.py +8 -4
  611. transformers/models/mvp/modeling_mvp.py +6 -4
  612. transformers/models/nanochat/configuration_nanochat.py +5 -7
  613. transformers/models/nanochat/modeling_nanochat.py +4 -4
  614. transformers/models/nanochat/modular_nanochat.py +4 -4
  615. transformers/models/nemotron/configuration_nemotron.py +5 -7
  616. transformers/models/nemotron/modeling_nemotron.py +4 -14
  617. transformers/models/nllb/tokenization_nllb.py +7 -5
  618. transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
  619. transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
  620. transformers/models/nougat/image_processing_nougat_fast.py +8 -8
  621. transformers/models/nystromformer/configuration_nystromformer.py +8 -1
  622. transformers/models/olmo/configuration_olmo.py +5 -7
  623. transformers/models/olmo/modeling_olmo.py +4 -4
  624. transformers/models/olmo/modular_olmo.py +3 -3
  625. transformers/models/olmo2/configuration_olmo2.py +9 -11
  626. transformers/models/olmo2/modeling_olmo2.py +4 -4
  627. transformers/models/olmo2/modular_olmo2.py +7 -7
  628. transformers/models/olmo3/configuration_olmo3.py +10 -11
  629. transformers/models/olmo3/modeling_olmo3.py +4 -4
  630. transformers/models/olmo3/modular_olmo3.py +13 -14
  631. transformers/models/olmoe/configuration_olmoe.py +5 -7
  632. transformers/models/olmoe/modeling_olmoe.py +4 -4
  633. transformers/models/olmoe/modular_olmoe.py +3 -3
  634. transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
  635. transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
  636. transformers/models/oneformer/configuration_oneformer.py +9 -46
  637. transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
  638. transformers/models/oneformer/modeling_oneformer.py +14 -9
  639. transformers/models/openai/configuration_openai.py +16 -0
  640. transformers/models/opt/configuration_opt.py +6 -6
  641. transformers/models/opt/modeling_opt.py +5 -5
  642. transformers/models/ovis2/configuration_ovis2.py +4 -0
  643. transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
  644. transformers/models/ovis2/modeling_ovis2.py +58 -99
  645. transformers/models/ovis2/modular_ovis2.py +52 -13
  646. transformers/models/owlv2/configuration_owlv2.py +4 -1
  647. transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
  648. transformers/models/owlv2/modeling_owlv2.py +40 -27
  649. transformers/models/owlv2/modular_owlv2.py +5 -5
  650. transformers/models/owlvit/configuration_owlvit.py +4 -1
  651. transformers/models/owlvit/modeling_owlvit.py +40 -27
  652. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
  653. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
  654. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
  655. transformers/models/paligemma/configuration_paligemma.py +4 -0
  656. transformers/models/paligemma/modeling_paligemma.py +30 -26
  657. transformers/models/parakeet/configuration_parakeet.py +2 -4
  658. transformers/models/parakeet/modeling_parakeet.py +3 -3
  659. transformers/models/parakeet/modular_parakeet.py +3 -3
  660. transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
  661. transformers/models/patchtst/modeling_patchtst.py +3 -3
  662. transformers/models/pe_audio/modeling_pe_audio.py +4 -4
  663. transformers/models/pe_audio/modular_pe_audio.py +1 -1
  664. transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
  665. transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
  666. transformers/models/pe_video/modeling_pe_video.py +36 -24
  667. transformers/models/pe_video/modular_pe_video.py +36 -23
  668. transformers/models/pegasus/configuration_pegasus.py +8 -5
  669. transformers/models/pegasus/modeling_pegasus.py +4 -4
  670. transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
  671. transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
  672. transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
  673. transformers/models/perceiver/modeling_perceiver.py +17 -9
  674. transformers/models/perception_lm/modeling_perception_lm.py +26 -27
  675. transformers/models/perception_lm/modular_perception_lm.py +27 -25
  676. transformers/models/persimmon/configuration_persimmon.py +5 -7
  677. transformers/models/persimmon/modeling_persimmon.py +5 -5
  678. transformers/models/phi/configuration_phi.py +8 -6
  679. transformers/models/phi/modeling_phi.py +4 -4
  680. transformers/models/phi/modular_phi.py +3 -3
  681. transformers/models/phi3/configuration_phi3.py +9 -11
  682. transformers/models/phi3/modeling_phi3.py +4 -4
  683. transformers/models/phi3/modular_phi3.py +3 -3
  684. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
  685. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
  686. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
  687. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
  688. transformers/models/phimoe/configuration_phimoe.py +5 -7
  689. transformers/models/phimoe/modeling_phimoe.py +15 -39
  690. transformers/models/phimoe/modular_phimoe.py +12 -7
  691. transformers/models/pix2struct/configuration_pix2struct.py +12 -9
  692. transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
  693. transformers/models/pix2struct/modeling_pix2struct.py +14 -7
  694. transformers/models/pixio/configuration_pixio.py +2 -4
  695. transformers/models/pixio/modeling_pixio.py +9 -8
  696. transformers/models/pixio/modular_pixio.py +4 -2
  697. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
  698. transformers/models/pixtral/modeling_pixtral.py +9 -12
  699. transformers/models/plbart/configuration_plbart.py +8 -5
  700. transformers/models/plbart/modeling_plbart.py +9 -7
  701. transformers/models/plbart/modular_plbart.py +1 -1
  702. transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
  703. transformers/models/pop2piano/configuration_pop2piano.py +7 -6
  704. transformers/models/pop2piano/modeling_pop2piano.py +2 -1
  705. transformers/models/pp_doclayout_v3/__init__.py +30 -0
  706. transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
  707. transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
  708. transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
  709. transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
  710. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
  711. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
  712. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
  713. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
  714. transformers/models/prophetnet/configuration_prophetnet.py +11 -10
  715. transformers/models/prophetnet/modeling_prophetnet.py +12 -23
  716. transformers/models/pvt/image_processing_pvt.py +7 -7
  717. transformers/models/pvt/image_processing_pvt_fast.py +1 -1
  718. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  719. transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
  720. transformers/models/qwen2/configuration_qwen2.py +14 -4
  721. transformers/models/qwen2/modeling_qwen2.py +4 -4
  722. transformers/models/qwen2/modular_qwen2.py +3 -3
  723. transformers/models/qwen2/tokenization_qwen2.py +0 -4
  724. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
  725. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
  726. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
  727. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
  728. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
  729. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
  730. transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
  731. transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
  732. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  733. transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
  734. transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
  735. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
  736. transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
  737. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
  738. transformers/models/qwen3/configuration_qwen3.py +15 -5
  739. transformers/models/qwen3/modeling_qwen3.py +4 -4
  740. transformers/models/qwen3/modular_qwen3.py +3 -3
  741. transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
  742. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  743. transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
  744. transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
  745. transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
  746. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
  747. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
  748. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
  749. transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
  750. transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
  751. transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
  752. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
  753. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
  754. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
  755. transformers/models/rag/configuration_rag.py +6 -6
  756. transformers/models/rag/modeling_rag.py +3 -3
  757. transformers/models/rag/retrieval_rag.py +1 -1
  758. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
  759. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
  760. transformers/models/reformer/configuration_reformer.py +7 -7
  761. transformers/models/rembert/configuration_rembert.py +8 -1
  762. transformers/models/rembert/modeling_rembert.py +0 -22
  763. transformers/models/resnet/configuration_resnet.py +2 -4
  764. transformers/models/resnet/modeling_resnet.py +6 -5
  765. transformers/models/roberta/configuration_roberta.py +11 -2
  766. transformers/models/roberta/modeling_roberta.py +6 -6
  767. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
  768. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
  769. transformers/models/roc_bert/configuration_roc_bert.py +8 -1
  770. transformers/models/roc_bert/modeling_roc_bert.py +6 -41
  771. transformers/models/roformer/configuration_roformer.py +13 -2
  772. transformers/models/roformer/modeling_roformer.py +0 -14
  773. transformers/models/rt_detr/configuration_rt_detr.py +8 -49
  774. transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
  775. transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
  776. transformers/models/rt_detr/modeling_rt_detr.py +578 -737
  777. transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
  778. transformers/models/rt_detr/modular_rt_detr.py +1508 -6
  779. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
  780. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
  781. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
  782. transformers/models/rwkv/configuration_rwkv.py +2 -3
  783. transformers/models/rwkv/modeling_rwkv.py +0 -23
  784. transformers/models/sam/configuration_sam.py +2 -0
  785. transformers/models/sam/image_processing_sam_fast.py +4 -4
  786. transformers/models/sam/modeling_sam.py +13 -8
  787. transformers/models/sam/processing_sam.py +3 -3
  788. transformers/models/sam2/configuration_sam2.py +1 -1
  789. transformers/models/sam2/modeling_sam2.py +56 -52
  790. transformers/models/sam2/modular_sam2.py +47 -55
  791. transformers/models/sam2_video/modeling_sam2_video.py +50 -51
  792. transformers/models/sam2_video/modular_sam2_video.py +12 -10
  793. transformers/models/sam3/modeling_sam3.py +43 -47
  794. transformers/models/sam3/processing_sam3.py +8 -4
  795. transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
  796. transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
  797. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
  798. transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
  799. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
  800. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
  801. transformers/models/sam3_video/modeling_sam3_video.py +27 -14
  802. transformers/models/sam_hq/configuration_sam_hq.py +2 -0
  803. transformers/models/sam_hq/modeling_sam_hq.py +13 -9
  804. transformers/models/sam_hq/modular_sam_hq.py +6 -6
  805. transformers/models/sam_hq/processing_sam_hq.py +7 -6
  806. transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
  807. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
  808. transformers/models/seed_oss/configuration_seed_oss.py +7 -9
  809. transformers/models/seed_oss/modeling_seed_oss.py +4 -4
  810. transformers/models/seed_oss/modular_seed_oss.py +3 -3
  811. transformers/models/segformer/image_processing_segformer_fast.py +4 -4
  812. transformers/models/segformer/modeling_segformer.py +4 -2
  813. transformers/models/segformer/modular_segformer.py +3 -3
  814. transformers/models/seggpt/modeling_seggpt.py +20 -8
  815. transformers/models/sew/configuration_sew.py +4 -1
  816. transformers/models/sew/modeling_sew.py +9 -5
  817. transformers/models/sew/modular_sew.py +2 -1
  818. transformers/models/sew_d/configuration_sew_d.py +4 -1
  819. transformers/models/sew_d/modeling_sew_d.py +4 -1
  820. transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
  821. transformers/models/siglip/configuration_siglip.py +4 -1
  822. transformers/models/siglip/modeling_siglip.py +27 -71
  823. transformers/models/siglip2/__init__.py +1 -0
  824. transformers/models/siglip2/configuration_siglip2.py +4 -2
  825. transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
  826. transformers/models/siglip2/modeling_siglip2.py +37 -78
  827. transformers/models/siglip2/modular_siglip2.py +74 -25
  828. transformers/models/siglip2/tokenization_siglip2.py +95 -0
  829. transformers/models/smollm3/configuration_smollm3.py +6 -6
  830. transformers/models/smollm3/modeling_smollm3.py +4 -4
  831. transformers/models/smollm3/modular_smollm3.py +9 -9
  832. transformers/models/smolvlm/configuration_smolvlm.py +1 -3
  833. transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
  834. transformers/models/smolvlm/modeling_smolvlm.py +75 -46
  835. transformers/models/smolvlm/modular_smolvlm.py +36 -23
  836. transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
  837. transformers/models/solar_open/__init__.py +27 -0
  838. transformers/models/solar_open/configuration_solar_open.py +184 -0
  839. transformers/models/solar_open/modeling_solar_open.py +642 -0
  840. transformers/models/solar_open/modular_solar_open.py +224 -0
  841. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
  842. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
  843. transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
  844. transformers/models/speecht5/configuration_speecht5.py +7 -8
  845. transformers/models/splinter/configuration_splinter.py +6 -6
  846. transformers/models/splinter/modeling_splinter.py +8 -3
  847. transformers/models/squeezebert/configuration_squeezebert.py +14 -1
  848. transformers/models/stablelm/configuration_stablelm.py +8 -6
  849. transformers/models/stablelm/modeling_stablelm.py +5 -5
  850. transformers/models/starcoder2/configuration_starcoder2.py +11 -5
  851. transformers/models/starcoder2/modeling_starcoder2.py +5 -5
  852. transformers/models/starcoder2/modular_starcoder2.py +4 -4
  853. transformers/models/superglue/configuration_superglue.py +4 -0
  854. transformers/models/superglue/image_processing_superglue_fast.py +4 -3
  855. transformers/models/superglue/modeling_superglue.py +9 -4
  856. transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
  857. transformers/models/superpoint/modeling_superpoint.py +4 -2
  858. transformers/models/swin/configuration_swin.py +2 -4
  859. transformers/models/swin/modeling_swin.py +11 -8
  860. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
  861. transformers/models/swin2sr/modeling_swin2sr.py +4 -2
  862. transformers/models/swinv2/configuration_swinv2.py +2 -4
  863. transformers/models/swinv2/modeling_swinv2.py +10 -7
  864. transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
  865. transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
  866. transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
  867. transformers/models/t5/configuration_t5.py +9 -8
  868. transformers/models/t5/modeling_t5.py +5 -8
  869. transformers/models/t5gemma/configuration_t5gemma.py +10 -25
  870. transformers/models/t5gemma/modeling_t5gemma.py +9 -9
  871. transformers/models/t5gemma/modular_t5gemma.py +11 -24
  872. transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
  873. transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
  874. transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
  875. transformers/models/table_transformer/configuration_table_transformer.py +18 -49
  876. transformers/models/table_transformer/modeling_table_transformer.py +27 -53
  877. transformers/models/tapas/configuration_tapas.py +12 -1
  878. transformers/models/tapas/modeling_tapas.py +1 -1
  879. transformers/models/tapas/tokenization_tapas.py +1 -0
  880. transformers/models/textnet/configuration_textnet.py +4 -6
  881. transformers/models/textnet/image_processing_textnet_fast.py +3 -3
  882. transformers/models/textnet/modeling_textnet.py +15 -14
  883. transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
  884. transformers/models/timesfm/modeling_timesfm.py +5 -6
  885. transformers/models/timesfm/modular_timesfm.py +5 -6
  886. transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
  887. transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
  888. transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
  889. transformers/models/trocr/configuration_trocr.py +11 -7
  890. transformers/models/trocr/modeling_trocr.py +4 -2
  891. transformers/models/tvp/configuration_tvp.py +10 -35
  892. transformers/models/tvp/image_processing_tvp_fast.py +6 -5
  893. transformers/models/tvp/modeling_tvp.py +1 -1
  894. transformers/models/udop/configuration_udop.py +16 -7
  895. transformers/models/udop/modeling_udop.py +10 -6
  896. transformers/models/umt5/configuration_umt5.py +8 -6
  897. transformers/models/umt5/modeling_umt5.py +7 -3
  898. transformers/models/unispeech/configuration_unispeech.py +4 -1
  899. transformers/models/unispeech/modeling_unispeech.py +7 -4
  900. transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
  901. transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
  902. transformers/models/upernet/configuration_upernet.py +8 -35
  903. transformers/models/upernet/modeling_upernet.py +1 -1
  904. transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
  905. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  906. transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
  907. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
  908. transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
  909. transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
  910. transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
  911. transformers/models/video_llava/configuration_video_llava.py +4 -0
  912. transformers/models/video_llava/modeling_video_llava.py +87 -89
  913. transformers/models/videomae/modeling_videomae.py +4 -5
  914. transformers/models/vilt/configuration_vilt.py +4 -1
  915. transformers/models/vilt/image_processing_vilt_fast.py +6 -6
  916. transformers/models/vilt/modeling_vilt.py +27 -12
  917. transformers/models/vipllava/configuration_vipllava.py +4 -0
  918. transformers/models/vipllava/modeling_vipllava.py +57 -31
  919. transformers/models/vipllava/modular_vipllava.py +50 -24
  920. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
  921. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
  922. transformers/models/visual_bert/configuration_visual_bert.py +6 -1
  923. transformers/models/vit/configuration_vit.py +2 -2
  924. transformers/models/vit/modeling_vit.py +7 -5
  925. transformers/models/vit_mae/modeling_vit_mae.py +11 -7
  926. transformers/models/vit_msn/modeling_vit_msn.py +11 -7
  927. transformers/models/vitdet/configuration_vitdet.py +2 -4
  928. transformers/models/vitdet/modeling_vitdet.py +2 -3
  929. transformers/models/vitmatte/configuration_vitmatte.py +6 -35
  930. transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
  931. transformers/models/vitmatte/modeling_vitmatte.py +1 -1
  932. transformers/models/vitpose/configuration_vitpose.py +6 -43
  933. transformers/models/vitpose/modeling_vitpose.py +5 -3
  934. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
  935. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
  936. transformers/models/vits/configuration_vits.py +4 -0
  937. transformers/models/vits/modeling_vits.py +9 -7
  938. transformers/models/vivit/modeling_vivit.py +4 -4
  939. transformers/models/vjepa2/modeling_vjepa2.py +9 -9
  940. transformers/models/voxtral/configuration_voxtral.py +0 -1
  941. transformers/models/voxtral/modeling_voxtral.py +25 -24
  942. transformers/models/voxtral/modular_voxtral.py +26 -20
  943. transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
  944. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
  945. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
  946. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
  947. transformers/models/wavlm/configuration_wavlm.py +4 -1
  948. transformers/models/wavlm/modeling_wavlm.py +4 -1
  949. transformers/models/whisper/configuration_whisper.py +6 -4
  950. transformers/models/whisper/generation_whisper.py +0 -1
  951. transformers/models/whisper/modeling_whisper.py +3 -3
  952. transformers/models/x_clip/configuration_x_clip.py +4 -1
  953. transformers/models/x_clip/modeling_x_clip.py +26 -27
  954. transformers/models/xglm/configuration_xglm.py +9 -7
  955. transformers/models/xlm/configuration_xlm.py +10 -7
  956. transformers/models/xlm/modeling_xlm.py +1 -1
  957. transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
  958. transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
  959. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
  960. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
  961. transformers/models/xlnet/configuration_xlnet.py +3 -1
  962. transformers/models/xlstm/configuration_xlstm.py +5 -7
  963. transformers/models/xlstm/modeling_xlstm.py +0 -32
  964. transformers/models/xmod/configuration_xmod.py +11 -2
  965. transformers/models/xmod/modeling_xmod.py +13 -16
  966. transformers/models/yolos/image_processing_yolos_fast.py +25 -28
  967. transformers/models/yolos/modeling_yolos.py +7 -7
  968. transformers/models/yolos/modular_yolos.py +16 -16
  969. transformers/models/yoso/configuration_yoso.py +8 -1
  970. transformers/models/youtu/__init__.py +27 -0
  971. transformers/models/youtu/configuration_youtu.py +194 -0
  972. transformers/models/youtu/modeling_youtu.py +619 -0
  973. transformers/models/youtu/modular_youtu.py +254 -0
  974. transformers/models/zamba/configuration_zamba.py +5 -7
  975. transformers/models/zamba/modeling_zamba.py +25 -56
  976. transformers/models/zamba2/configuration_zamba2.py +8 -13
  977. transformers/models/zamba2/modeling_zamba2.py +53 -78
  978. transformers/models/zamba2/modular_zamba2.py +36 -29
  979. transformers/models/zoedepth/configuration_zoedepth.py +17 -40
  980. transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
  981. transformers/models/zoedepth/modeling_zoedepth.py +5 -3
  982. transformers/pipelines/__init__.py +1 -61
  983. transformers/pipelines/any_to_any.py +1 -1
  984. transformers/pipelines/automatic_speech_recognition.py +0 -2
  985. transformers/pipelines/base.py +1 -1
  986. transformers/pipelines/image_text_to_text.py +1 -1
  987. transformers/pipelines/text_to_audio.py +5 -1
  988. transformers/processing_utils.py +35 -44
  989. transformers/pytorch_utils.py +2 -26
  990. transformers/quantizers/quantizer_compressed_tensors.py +7 -5
  991. transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
  992. transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
  993. transformers/quantizers/quantizer_mxfp4.py +1 -1
  994. transformers/quantizers/quantizer_torchao.py +0 -16
  995. transformers/safetensors_conversion.py +11 -4
  996. transformers/testing_utils.py +3 -28
  997. transformers/tokenization_mistral_common.py +9 -0
  998. transformers/tokenization_python.py +6 -4
  999. transformers/tokenization_utils_base.py +119 -219
  1000. transformers/tokenization_utils_tokenizers.py +31 -2
  1001. transformers/trainer.py +25 -33
  1002. transformers/trainer_seq2seq.py +1 -1
  1003. transformers/training_args.py +411 -417
  1004. transformers/utils/__init__.py +1 -4
  1005. transformers/utils/auto_docstring.py +15 -18
  1006. transformers/utils/backbone_utils.py +13 -373
  1007. transformers/utils/doc.py +4 -36
  1008. transformers/utils/generic.py +69 -33
  1009. transformers/utils/import_utils.py +72 -75
  1010. transformers/utils/loading_report.py +133 -105
  1011. transformers/utils/quantization_config.py +0 -21
  1012. transformers/video_processing_utils.py +5 -5
  1013. transformers/video_utils.py +3 -1
  1014. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
  1015. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
  1016. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
  1017. transformers/pipelines/deprecated/text2text_generation.py +0 -408
  1018. transformers/pipelines/image_to_text.py +0 -189
  1019. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
  1020. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
  1021. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -12,25 +12,26 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  import math
15
- from typing import Any
16
15
 
17
16
  import torch
17
+ import torch.nn as nn
18
18
  import torch.nn.functional as F
19
- from torch import nn
20
19
 
21
20
  from ... import initialization as init
22
21
  from ...activations import ACT2CLS
22
+ from ...backbone_utils import consolidate_backbone_kwargs_to_config
23
23
  from ...configuration_utils import PreTrainedConfig
24
24
  from ...image_transforms import corners_to_center_format
25
- from ...utils import is_torchdynamo_compiling, logging
26
- from ...utils.backbone_utils import verify_backbone_config_arguments
27
- from ..auto import CONFIG_MAPPING, AutoConfig
25
+ from ...processing_utils import Unpack
26
+ from ...utils import TransformersKwargs, logging, torch_compilable_check
27
+ from ..auto import AutoConfig
28
28
  from ..rt_detr.modeling_rt_detr import (
29
+ RTDetrAIFILayer,
29
30
  RTDetrConvNormLayer,
30
31
  RTDetrDecoder,
31
32
  RTDetrDecoderLayer,
32
33
  RTDetrDecoderOutput,
33
- RTDetrEncoder,
34
+ RTDetrEncoderLayer,
34
35
  RTDetrForObjectDetection,
35
36
  RTDetrFrozenBatchNorm2d,
36
37
  RTDetrHybridEncoder,
@@ -68,20 +69,8 @@ class DFineConfig(PreTrainedConfig):
68
69
  The epsilon used by the batch normalization layers.
69
70
  backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `HGNetV2Config()`):
70
71
  The configuration of the backbone model.
71
- backbone (`str`, *optional*):
72
- Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
73
- will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
74
- is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
75
- use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
76
- Whether to use pretrained weights for the backbone.
77
- use_timm_backbone (`bool`, *optional*, defaults to `False`):
78
- Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
79
- library.
80
72
  freeze_backbone_batch_norms (`bool`, *optional*, defaults to `True`):
81
73
  Whether to freeze the batch normalization layers in the backbone.
82
- backbone_kwargs (`dict`, *optional*):
83
- Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
84
- e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
85
74
  encoder_hidden_dim (`int`, *optional*, defaults to 256):
86
75
  Dimension of the layers in hybrid encoder.
87
76
  encoder_in_channels (`list`, *optional*, defaults to `[512, 1024, 2048]`):
@@ -210,6 +199,8 @@ class DFineConfig(PreTrainedConfig):
210
199
  The method to use for the decoder: `"default"` or `"discrete"`.
211
200
  up (`float`, *optional*, defaults to 0.5):
212
201
  Controls the upper bounds of the Weighting Function.
202
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`):
203
+ Whether to tie weight embeddings
213
204
  """
214
205
 
215
206
  model_type = "d_fine"
@@ -228,11 +219,7 @@ class DFineConfig(PreTrainedConfig):
228
219
  batch_norm_eps=1e-5,
229
220
  # backbone
230
221
  backbone_config=None,
231
- backbone=None,
232
- use_pretrained_backbone=False,
233
- use_timm_backbone=False,
234
222
  freeze_backbone_batch_norms=True,
235
- backbone_kwargs=None,
236
223
  # encoder HybridEncoder
237
224
  encoder_hidden_dim=256,
238
225
  encoder_in_channels=[512, 1024, 2048],
@@ -294,52 +281,23 @@ class DFineConfig(PreTrainedConfig):
294
281
  decoder_offset_scale=0.5,
295
282
  decoder_method="default",
296
283
  up=0.5,
284
+ tie_word_embeddings=True,
297
285
  **kwargs,
298
286
  ):
299
287
  self.initializer_range = initializer_range
300
288
  self.initializer_bias_prior_prob = initializer_bias_prior_prob
301
289
  self.layer_norm_eps = layer_norm_eps
302
290
  self.batch_norm_eps = batch_norm_eps
303
- # backbone
304
- if backbone_config is None and backbone is None:
305
- logger.info(
306
- "`backbone_config` and `backbone` are `None`. Initializing the config with the default `HGNet-V2` backbone."
307
- )
308
- backbone_model_type = "hgnet_v2"
309
- config_class = CONFIG_MAPPING[backbone_model_type]
310
- # this will map it to HGNetV2Config
311
- # and we would need to create HGNetV2Backbone
312
- backbone_config = config_class(
313
- num_channels=3,
314
- embedding_size=64,
315
- hidden_sizes=[256, 512, 1024, 2048],
316
- depths=[3, 4, 6, 3],
317
- layer_type="bottleneck",
318
- hidden_act="relu",
319
- downsample_in_first_stage=False,
320
- downsample_in_bottleneck=False,
321
- out_features=None,
322
- out_indices=[2, 3, 4],
323
- )
324
- elif isinstance(backbone_config, dict):
325
- backbone_model_type = backbone_config.pop("model_type")
326
- config_class = CONFIG_MAPPING[backbone_model_type]
327
- backbone_config = config_class.from_dict(backbone_config)
328
-
329
- verify_backbone_config_arguments(
330
- use_timm_backbone=use_timm_backbone,
331
- use_pretrained_backbone=use_pretrained_backbone,
332
- backbone=backbone,
291
+
292
+ backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
333
293
  backbone_config=backbone_config,
334
- backbone_kwargs=backbone_kwargs,
294
+ default_config_type="hgnet_v2",
295
+ default_config_kwargs={"out_indices": [2, 3, 4]},
296
+ **kwargs,
335
297
  )
336
298
 
337
299
  self.backbone_config = backbone_config
338
- self.backbone = backbone
339
- self.use_pretrained_backbone = use_pretrained_backbone
340
- self.use_timm_backbone = use_timm_backbone
341
300
  self.freeze_backbone_batch_norms = freeze_backbone_batch_norms
342
- self.backbone_kwargs = backbone_kwargs
343
301
  # encoder
344
302
  self.encoder_hidden_dim = encoder_hidden_dim
345
303
  self.encoder_in_channels = encoder_in_channels
@@ -401,6 +359,7 @@ class DFineConfig(PreTrainedConfig):
401
359
  self.lqe_hidden_dim = lqe_hidden_dim
402
360
  self.lqe_layers = lqe_layers
403
361
  self.up = up
362
+ self.tie_word_embeddings = tie_word_embeddings
404
363
 
405
364
  if isinstance(self.decoder_n_points, list):
406
365
  if len(self.decoder_n_points) != self.num_feature_levels:
@@ -417,6 +376,93 @@ class DFineConfig(PreTrainedConfig):
417
376
  super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
418
377
 
419
378
 
379
+ class DFineDecoderOutput(RTDetrDecoderOutput):
380
+ pass
381
+
382
+
383
+ def weighting_function(max_num_bins: int, up: torch.Tensor, reg_scale: int) -> torch.Tensor:
384
+ """
385
+ Generates the non-uniform Weighting Function W(n) for bounding box regression.
386
+
387
+ Args:
388
+ max_num_bins (int): Max number of the discrete bins.
389
+ up (Tensor): Controls upper bounds of the sequence,
390
+ where maximum offset is ±up * H / W.
391
+ reg_scale (float): Controls the curvature of the Weighting Function.
392
+ Larger values result in flatter weights near the central axis W(max_num_bins/2)=0
393
+ and steeper weights at both ends.
394
+ Returns:
395
+ Tensor: Sequence of Weighting Function.
396
+ """
397
+ upper_bound1 = abs(up[0]) * abs(reg_scale)
398
+ upper_bound2 = abs(up[0]) * abs(reg_scale) * 2
399
+ step = (upper_bound1 + 1) ** (2 / (max_num_bins - 2))
400
+ left_values = [-((step) ** i) + 1 for i in range(max_num_bins // 2 - 1, 0, -1)]
401
+ right_values = [(step) ** i - 1 for i in range(1, max_num_bins // 2)]
402
+ values = [-upper_bound2] + left_values + [torch.zeros_like(up[0][None])] + right_values + [upper_bound2]
403
+ values = torch.cat(values, 0)
404
+ return values
405
+
406
+
407
+ def distance2bbox(points, distance: torch.Tensor, reg_scale: float) -> torch.Tensor:
408
+ """
409
+ Decodes edge-distances into bounding box coordinates.
410
+
411
+ Args:
412
+ points (`torch.Tensor`):
413
+ (batch_size, num_boxes, 4) or (num_boxes, 4) format, representing [x_center, y_center, width, height]
414
+ distance (`torch.Tensor`):
415
+ (batch_size, num_boxes, 4) or (num_boxes, 4), representing distances from the point to the left, top, right, and bottom boundaries.
416
+ reg_scale (`float`):
417
+ Controls the curvature of the Weighting Function.
418
+ Returns:
419
+ `torch.Tensor`: Bounding boxes in (batch_size, num_boxes, 4) or (num_boxes, 4) format, representing [x_center, y_center, width, height]
420
+ """
421
+ reg_scale = abs(reg_scale)
422
+ top_left_x = points[..., 0] - (0.5 * reg_scale + distance[..., 0]) * (points[..., 2] / reg_scale)
423
+ top_left_y = points[..., 1] - (0.5 * reg_scale + distance[..., 1]) * (points[..., 3] / reg_scale)
424
+ bottom_right_x = points[..., 0] + (0.5 * reg_scale + distance[..., 2]) * (points[..., 2] / reg_scale)
425
+ bottom_right_y = points[..., 1] + (0.5 * reg_scale + distance[..., 3]) * (points[..., 3] / reg_scale)
426
+
427
+ bboxes = torch.stack([top_left_x, top_left_y, bottom_right_x, bottom_right_y], -1)
428
+
429
+ return corners_to_center_format(bboxes)
430
+
431
+
432
+ class DFineMLP(nn.Module):
433
+ def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act: str = "relu"):
434
+ super().__init__()
435
+ self.num_layers = num_layers
436
+ hidden_dims = [hidden_dim] * (num_layers - 1)
437
+ input_dims = [input_dim] + hidden_dims
438
+ output_dims = hidden_dims + [output_dim]
439
+ self.layers = nn.ModuleList(nn.Linear(in_dim, out_dim) for in_dim, out_dim in zip(input_dims, output_dims))
440
+ self.act = ACT2CLS[act]()
441
+
442
+ def forward(self, stat_features: torch.Tensor) -> torch.Tensor:
443
+ for i, layer in enumerate(self.layers):
444
+ stat_features = self.act(layer(stat_features)) if i < self.num_layers - 1 else layer(stat_features)
445
+ return stat_features
446
+
447
+
448
+ class DFineGate(nn.Module):
449
+ def __init__(self, d_model: int):
450
+ super().__init__()
451
+ self.gate = nn.Linear(2 * d_model, 2 * d_model)
452
+ self.norm = nn.LayerNorm(d_model)
453
+
454
+ def forward(self, second_residual: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
455
+ gate_input = torch.cat([second_residual, hidden_states], dim=-1)
456
+ gates = torch.sigmoid(self.gate(gate_input))
457
+ gate1, gate2 = gates.chunk(2, dim=-1)
458
+ hidden_states = self.norm(gate1 * second_residual + gate2 * hidden_states)
459
+ return hidden_states
460
+
461
+
462
+ class DFineFrozenBatchNorm2d(RTDetrFrozenBatchNorm2d):
463
+ pass
464
+
465
+
420
466
  class DFineMultiscaleDeformableAttention(nn.Module):
421
467
  def __init__(self, config: DFineConfig):
422
468
  """
@@ -454,14 +500,15 @@ class DFineMultiscaleDeformableAttention(nn.Module):
454
500
  encoder_hidden_states=None,
455
501
  spatial_shapes=None,
456
502
  spatial_shapes_list=None,
503
+ **kwargs: Unpack[TransformersKwargs],
457
504
  ) -> tuple[torch.Tensor, torch.Tensor]:
458
505
  batch_size, num_queries, _ = hidden_states.shape
459
506
  batch_size, sequence_length, _ = encoder_hidden_states.shape
460
507
 
461
- if not is_torchdynamo_compiling() and (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
462
- raise ValueError(
463
- "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
464
- )
508
+ torch_compilable_check(
509
+ (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == sequence_length,
510
+ "Make sure to align the spatial shapes with the sequence length of the encoder hidden states",
511
+ )
465
512
 
466
513
  # Reshape for multi-head attention
467
514
  value = encoder_hidden_states.reshape(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads)
@@ -508,18 +555,171 @@ class DFineMultiscaleDeformableAttention(nn.Module):
508
555
  return output, attention_weights
509
556
 
510
557
 
511
- class DFineGate(nn.Module):
512
- def __init__(self, d_model: int):
558
+ class DFineConvNormLayer(RTDetrConvNormLayer):
559
+ def __init__(
560
+ self,
561
+ config: DFineConfig,
562
+ in_channels: int,
563
+ out_channels: int,
564
+ kernel_size: int,
565
+ stride: int,
566
+ groups: int = 1,
567
+ padding: int | None = None,
568
+ activation: str | None = None,
569
+ ):
570
+ super().__init__(config, in_channels, out_channels, kernel_size, stride, padding=None, activation=activation)
571
+ self.conv = nn.Conv2d(
572
+ in_channels,
573
+ out_channels,
574
+ kernel_size,
575
+ stride,
576
+ groups=groups,
577
+ padding=(kernel_size - 1) // 2 if padding is None else padding,
578
+ bias=False,
579
+ )
580
+
581
+
582
+ class DFineRepVggBlock(RTDetrRepVggBlock):
583
+ def __init__(self, config: DFineConfig, in_channels: int, out_channels: int):
584
+ super().__init__(config)
585
+ hidden_channels = in_channels
586
+ self.conv1 = DFineConvNormLayer(config, hidden_channels, out_channels, 3, 1, padding=1)
587
+ self.conv2 = DFineConvNormLayer(config, hidden_channels, out_channels, 1, 1, padding=0)
588
+
589
+
590
+ class DFineCSPRepLayer(nn.Module):
591
+ """
592
+ Cross Stage Partial (CSP) network layer with RepVGG blocks.
593
+ """
594
+
595
+ def __init__(
596
+ self, config: DFineConfig, in_channels: int, out_channels: int, num_blocks: int, expansion: float = 1.0
597
+ ):
513
598
  super().__init__()
514
- self.gate = nn.Linear(2 * d_model, 2 * d_model)
515
- self.norm = nn.LayerNorm(d_model)
599
+ activation = config.activation_function
516
600
 
517
- def forward(self, second_residual: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
518
- gate_input = torch.cat([second_residual, hidden_states], dim=-1)
519
- gates = torch.sigmoid(self.gate(gate_input))
520
- gate1, gate2 = gates.chunk(2, dim=-1)
521
- hidden_states = self.norm(gate1 * second_residual + gate2 * hidden_states)
522
- return hidden_states
601
+ hidden_channels = int(out_channels * expansion)
602
+ self.conv1 = DFineConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation)
603
+ self.conv2 = DFineConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation)
604
+ self.bottlenecks = nn.ModuleList(
605
+ [DFineRepVggBlock(config, hidden_channels, hidden_channels) for _ in range(num_blocks)]
606
+ )
607
+ if hidden_channels != out_channels:
608
+ self.conv3 = DFineConvNormLayer(config, hidden_channels, out_channels, 1, 1, activation=activation)
609
+ else:
610
+ self.conv3 = nn.Identity()
611
+
612
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
613
+ hidden_state_1 = self.conv1(hidden_state)
614
+ for bottleneck in self.bottlenecks:
615
+ hidden_state_1 = bottleneck(hidden_state_1)
616
+ hidden_state_2 = self.conv2(hidden_state)
617
+ hidden_state_3 = self.conv3(hidden_state_1 + hidden_state_2)
618
+ return hidden_state_3
619
+
620
+
621
+ class DFineRepNCSPELAN4(nn.Module):
622
+ def __init__(self, config: DFineConfig, act: str = "silu", numb_blocks: int = 3):
623
+ super().__init__()
624
+ conv1_dim = config.encoder_hidden_dim * 2
625
+ conv2_dim = config.encoder_hidden_dim
626
+ conv3_dim = config.encoder_hidden_dim * 2
627
+ conv4_dim = round(config.hidden_expansion * config.encoder_hidden_dim // 2)
628
+ self.conv_dim = conv3_dim // 2
629
+ self.conv1 = DFineConvNormLayer(config, conv1_dim, conv3_dim, 1, 1, activation=act)
630
+ self.csp_rep1 = DFineCSPRepLayer(config, conv3_dim // 2, conv4_dim, num_blocks=numb_blocks)
631
+ self.conv2 = DFineConvNormLayer(config, conv4_dim, conv4_dim, 3, 1, activation=act)
632
+ self.csp_rep2 = DFineCSPRepLayer(config, conv4_dim, conv4_dim, num_blocks=numb_blocks)
633
+ self.conv3 = DFineConvNormLayer(config, conv4_dim, conv4_dim, 3, 1, activation=act)
634
+ self.conv4 = DFineConvNormLayer(config, conv3_dim + (2 * conv4_dim), conv2_dim, 1, 1, activation=act)
635
+
636
+ def forward(self, input_features: torch.Tensor) -> torch.Tensor:
637
+ # Split initial features into two branches after first convolution
638
+ split_features = list(self.conv1(input_features).split((self.conv_dim, self.conv_dim), 1))
639
+
640
+ # Process branches sequentially
641
+ branch1 = self.csp_rep1(split_features[-1])
642
+ branch1 = self.conv2(branch1)
643
+ branch2 = self.csp_rep2(branch1)
644
+ branch2 = self.conv3(branch2)
645
+
646
+ split_features.extend([branch1, branch2])
647
+ merged_features = torch.cat(split_features, 1)
648
+ merged_features = self.conv4(merged_features)
649
+ return merged_features
650
+
651
+
652
+ class DFineSCDown(nn.Module):
653
+ def __init__(self, config: DFineConfig, kernel_size: int, stride: int):
654
+ super().__init__()
655
+ self.conv1 = DFineConvNormLayer(config, config.encoder_hidden_dim, config.encoder_hidden_dim, 1, 1)
656
+ self.conv2 = DFineConvNormLayer(
657
+ config,
658
+ config.encoder_hidden_dim,
659
+ config.encoder_hidden_dim,
660
+ kernel_size,
661
+ stride,
662
+ config.encoder_hidden_dim,
663
+ )
664
+
665
+ def forward(self, input_features: torch.Tensor) -> torch.Tensor:
666
+ input_features = self.conv1(input_features)
667
+ input_features = self.conv2(input_features)
668
+ return input_features
669
+
670
+
671
+ class DFineEncoderLayer(RTDetrEncoderLayer):
672
+ def __init__(self, config: DFineConfig):
673
+ super().__init__(config)
674
+ self.mlp = DFineMLP(
675
+ self.hidden_size, config.encoder_ffn_dim, self.hidden_size, 2, config.encoder_activation_function
676
+ )
677
+
678
+
679
+ class DFineAIFILayer(RTDetrAIFILayer):
680
+ pass
681
+
682
+
683
+ class DFineIntegral(nn.Module):
684
+ """
685
+ A static layer that calculates integral results from a distribution.
686
+
687
+ This layer computes the target location using the formula: `sum{Pr(n) * W(n)}`,
688
+ where Pr(n) is the softmax probability vector representing the discrete
689
+ distribution, and W(n) is the non-uniform Weighting Function.
690
+
691
+ Args:
692
+ max_num_bins (int): Max number of the discrete bins. Default is 32.
693
+ It can be adjusted based on the dataset or task requirements.
694
+ """
695
+
696
+ def __init__(self, config: DFineConfig):
697
+ super().__init__()
698
+ self.max_num_bins = config.max_num_bins
699
+
700
+ def forward(self, pred_corners: torch.Tensor, project: torch.Tensor) -> torch.Tensor:
701
+ batch_size, num_queries, _ = pred_corners.shape
702
+ pred_corners = F.softmax(pred_corners.reshape(-1, self.max_num_bins + 1), dim=1)
703
+ pred_corners = F.linear(pred_corners, project.to(pred_corners.device)).reshape(-1, 4)
704
+ pred_corners = pred_corners.reshape(batch_size, num_queries, -1)
705
+ return pred_corners
706
+
707
+
708
+ class DFineLQE(nn.Module):
709
+ def __init__(self, config: DFineConfig):
710
+ super().__init__()
711
+ self.top_prob_values = config.top_prob_values
712
+ self.max_num_bins = config.max_num_bins
713
+ self.reg_conf = DFineMLP(4 * (self.top_prob_values + 1), config.lqe_hidden_dim, 1, config.lqe_layers)
714
+
715
+ def forward(self, scores: torch.Tensor, pred_corners: torch.Tensor) -> torch.Tensor:
716
+ batch_size, length, _ = pred_corners.size()
717
+ prob = F.softmax(pred_corners.reshape(batch_size, length, 4, self.max_num_bins + 1), dim=-1)
718
+ prob_topk, _ = prob.topk(self.top_prob_values, dim=-1)
719
+ stat = torch.cat([prob_topk, prob_topk.mean(dim=-1, keepdim=True)], dim=-1)
720
+ quality_score = self.reg_conf(stat.reshape(batch_size, length, -1))
721
+ scores = scores + quality_score
722
+ return scores
523
723
 
524
724
 
525
725
  class DFineDecoderLayer(RTDetrDecoderLayer):
@@ -530,6 +730,9 @@ class DFineDecoderLayer(RTDetrDecoderLayer):
530
730
  self.encoder_attn = DFineMultiscaleDeformableAttention(config=config)
531
731
  # gate
532
732
  self.gateway = DFineGate(config.d_model)
733
+ self.mlp = DFineMLP(
734
+ self.hidden_size, config.decoder_ffn_dim, self.hidden_size, 2, config.decoder_activation_function
735
+ )
533
736
 
534
737
  del self.encoder_attn_layer_norm
535
738
 
@@ -542,49 +745,47 @@ class DFineDecoderLayer(RTDetrDecoderLayer):
542
745
  spatial_shapes_list=None,
543
746
  encoder_hidden_states: torch.Tensor | None = None,
544
747
  encoder_attention_mask: torch.Tensor | None = None,
545
- output_attentions: bool | None = False,
546
- ) -> tuple[torch.Tensor, Any, Any]:
748
+ **kwargs: Unpack[TransformersKwargs],
749
+ ) -> torch.Tensor:
750
+ residual = hidden_states
751
+
547
752
  # Self Attention
548
- hidden_states_2, self_attn_weights = self.self_attn(
753
+ hidden_states, _ = self.self_attn(
549
754
  hidden_states=hidden_states,
550
755
  attention_mask=encoder_attention_mask,
551
756
  position_embeddings=position_embeddings,
552
- output_attentions=output_attentions,
757
+ **kwargs,
553
758
  )
554
759
 
555
- hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.dropout, training=self.training)
556
- hidden_states = hidden_states + hidden_states_2
760
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
761
+ hidden_states = residual + hidden_states
557
762
  hidden_states = self.self_attn_layer_norm(hidden_states)
763
+
558
764
  residual = hidden_states
559
765
 
560
766
  # Cross-Attention
561
- cross_attn_weights = None
562
767
  hidden_states = hidden_states if position_embeddings is None else hidden_states + position_embeddings
563
- hidden_states_2, cross_attn_weights = self.encoder_attn(
768
+ hidden_states, _ = self.encoder_attn(
564
769
  hidden_states=hidden_states,
565
770
  encoder_hidden_states=encoder_hidden_states,
566
771
  reference_points=reference_points,
567
772
  spatial_shapes=spatial_shapes,
568
773
  spatial_shapes_list=spatial_shapes_list,
569
774
  )
570
-
571
- hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.dropout, training=self.training)
572
- hidden_states = self.gateway(residual, hidden_states_2)
775
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
776
+ hidden_states = self.gateway(residual, hidden_states)
573
777
 
574
778
  # Fully Connected
575
- hidden_states_2 = self.activation_fn(self.fc1(hidden_states))
576
- hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.activation_dropout, training=self.training)
577
- hidden_states_2 = self.fc2(hidden_states_2)
578
- hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.dropout, training=self.training)
579
- hidden_states = hidden_states + hidden_states_2
779
+ residual = hidden_states
780
+ hidden_states = self.mlp(hidden_states)
781
+ hidden_states = residual + hidden_states
580
782
  hidden_states = self.final_layer_norm(hidden_states.clamp(min=-65504, max=65504))
581
783
 
582
- outputs = (hidden_states,)
784
+ return hidden_states
583
785
 
584
- if output_attentions:
585
- outputs += (self_attn_weights, cross_attn_weights)
586
786
 
587
- return outputs
787
+ class DFineMLPPredictionHead(RTDetrMLPPredictionHead):
788
+ pass
588
789
 
589
790
 
590
791
  class DFinePreTrainedModel(RTDetrPreTrainedModel):
@@ -664,33 +865,42 @@ class DFinePreTrainedModel(RTDetrPreTrainedModel):
664
865
  init.xavier_uniform_(module.denoising_class_embed.weight)
665
866
 
666
867
 
667
- class DFineIntegral(nn.Module):
668
- """
669
- A static layer that calculates integral results from a distribution.
670
-
671
- This layer computes the target location using the formula: `sum{Pr(n) * W(n)}`,
672
- where Pr(n) is the softmax probability vector representing the discrete
673
- distribution, and W(n) is the non-uniform Weighting Function.
674
-
675
- Args:
676
- max_num_bins (int): Max number of the discrete bins. Default is 32.
677
- It can be adjusted based on the dataset or task requirements.
678
- """
679
-
868
+ class DFineHybridEncoder(RTDetrHybridEncoder):
680
869
  def __init__(self, config: DFineConfig):
681
- super().__init__()
682
- self.max_num_bins = config.max_num_bins
870
+ DFinePreTrainedModel.__init__(config)
871
+ self.config = config
872
+ self.in_channels = config.encoder_in_channels
873
+ self.num_fpn_stages = len(self.in_channels) - 1
874
+ self.feat_strides = config.feat_strides
875
+ self.encoder_hidden_dim = config.encoder_hidden_dim
876
+ self.encode_proj_layers = config.encode_proj_layers
877
+ self.positional_encoding_temperature = config.positional_encoding_temperature
878
+ self.eval_size = config.eval_size
879
+ self.out_channels = [self.encoder_hidden_dim for _ in self.in_channels]
880
+ self.out_strides = self.feat_strides
683
881
 
684
- def forward(self, pred_corners: torch.Tensor, project: torch.Tensor) -> torch.Tensor:
685
- batch_size, num_queries, _ = pred_corners.shape
686
- pred_corners = F.softmax(pred_corners.reshape(-1, self.max_num_bins + 1), dim=1)
687
- pred_corners = F.linear(pred_corners, project.to(pred_corners.device)).reshape(-1, 4)
688
- pred_corners = pred_corners.reshape(batch_size, num_queries, -1)
689
- return pred_corners
882
+ # AIFI (Attention-based Intra-scale Feature Interaction) layers
883
+ self.aifi = nn.ModuleList([DFineAIFILayer(config) for _ in range(len(self.encode_proj_layers))])
690
884
 
885
+ # top-down fpn
886
+ self.lateral_convs = nn.ModuleList()
887
+ self.fpn_blocks = nn.ModuleList()
888
+ for _ in range(len(self.in_channels) - 1, 0, -1):
889
+ lateral_layer = DFineConvNormLayer(config, self.encoder_hidden_dim, self.encoder_hidden_dim, 1, 1)
890
+ self.lateral_convs.append(lateral_layer)
891
+ num_blocks = round(3 * config.depth_mult)
892
+ fpn_layer = DFineRepNCSPELAN4(config, numb_blocks=num_blocks)
893
+ self.fpn_blocks.append(fpn_layer)
894
+
895
+ # bottom-up pan
896
+ self.downsample_convs = nn.ModuleList()
897
+ self.pan_blocks = nn.ModuleList()
898
+ for _ in range(len(self.in_channels) - 1):
899
+ self.downsample_convs.append(DFineSCDown(config, 3, 2))
900
+ num_blocks = round(3 * config.depth_mult)
901
+ self.pan_blocks.append(DFineRepNCSPELAN4(config, numb_blocks=num_blocks))
691
902
 
692
- class DFineDecoderOutput(RTDetrDecoderOutput):
693
- pass
903
+ self.post_init()
694
904
 
695
905
 
696
906
  class DFineDecoder(RTDetrDecoder):
@@ -727,26 +937,14 @@ class DFineDecoder(RTDetrDecoder):
727
937
  spatial_shapes,
728
938
  level_start_index=None,
729
939
  spatial_shapes_list=None,
730
- output_hidden_states=None,
731
940
  encoder_attention_mask=None,
732
941
  memory_mask=None,
733
- output_attentions=None,
734
- return_dict=None,
735
- **kwargs,
942
+ **kwargs: Unpack[TransformersKwargs],
736
943
  ) -> DFineDecoderOutput:
737
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
738
- output_hidden_states = (
739
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
740
- )
741
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
742
-
743
944
  if inputs_embeds is not None:
744
945
  hidden_states = inputs_embeds
745
946
 
746
947
  # decoder layers
747
- all_hidden_states = () if output_hidden_states else None
748
- all_self_attns = () if output_attentions else None
749
- all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
750
948
  intermediate = ()
751
949
  intermediate_reference_points = ()
752
950
  intermediate_logits = ()
@@ -762,25 +960,22 @@ class DFineDecoder(RTDetrDecoder):
762
960
  ref_points_input = ref_points_detach.unsqueeze(2)
763
961
  query_pos_embed = self.query_pos_head(ref_points_detach).clamp(min=-10, max=10)
764
962
 
765
- if output_hidden_states:
766
- all_hidden_states += (hidden_states,)
767
-
768
- output = decoder_layer(
769
- hidden_states=hidden_states,
963
+ hidden_states = decoder_layer(
964
+ hidden_states,
770
965
  position_embeddings=query_pos_embed,
771
966
  reference_points=ref_points_input,
772
967
  spatial_shapes=spatial_shapes,
773
968
  spatial_shapes_list=spatial_shapes_list,
774
969
  encoder_hidden_states=encoder_hidden_states,
775
970
  encoder_attention_mask=encoder_attention_mask,
776
- output_attentions=output_attentions,
971
+ **kwargs,
777
972
  )
778
973
 
779
- hidden_states = output[0]
780
-
781
974
  if i == 0:
782
975
  # Initial bounding box predictions with inverse sigmoid refinement
783
- new_reference_points = F.sigmoid(self.pre_bbox_head(output[0]) + inverse_sigmoid(ref_points_detach))
976
+ new_reference_points = F.sigmoid(
977
+ self.pre_bbox_head(hidden_states) + inverse_sigmoid(ref_points_detach)
978
+ )
784
979
  ref_points_initial = new_reference_points.detach()
785
980
 
786
981
  # Refine bounding box corners using FDR, integrating previous layer's corrections
@@ -809,12 +1004,6 @@ class DFineDecoder(RTDetrDecoder):
809
1004
  initial_reference_points += (ref_points_initial,)
810
1005
  intermediate_predicted_corners += (pred_corners,)
811
1006
 
812
- if output_attentions:
813
- all_self_attns += (output[1],)
814
-
815
- if encoder_hidden_states is not None:
816
- all_cross_attentions += (output[2],)
817
-
818
1007
  # Keep batch_size as first dimension
819
1008
  intermediate = torch.stack(intermediate)
820
1009
  if self.class_embed is not None and self.bbox_embed is not None:
@@ -823,27 +1012,6 @@ class DFineDecoder(RTDetrDecoder):
823
1012
  initial_reference_points = torch.stack(initial_reference_points, dim=1)
824
1013
  intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1)
825
1014
 
826
- # add hidden states from the last decoder layer
827
- if output_hidden_states:
828
- all_hidden_states += (hidden_states,)
829
-
830
- if not return_dict:
831
- return tuple(
832
- v
833
- for v in [
834
- hidden_states,
835
- intermediate,
836
- intermediate_logits,
837
- intermediate_reference_points,
838
- intermediate_predicted_corners,
839
- initial_reference_points,
840
- all_hidden_states,
841
- all_self_attns,
842
- all_cross_attentions,
843
- ]
844
- if v is not None
845
- )
846
-
847
1015
  return DFineDecoderOutput(
848
1016
  last_hidden_state=hidden_states,
849
1017
  intermediate_hidden_states=intermediate,
@@ -851,16 +1019,9 @@ class DFineDecoder(RTDetrDecoder):
851
1019
  intermediate_reference_points=intermediate_reference_points,
852
1020
  intermediate_predicted_corners=intermediate_predicted_corners,
853
1021
  initial_reference_points=initial_reference_points,
854
- hidden_states=all_hidden_states,
855
- attentions=all_self_attns,
856
- cross_attentions=all_cross_attentions,
857
1022
  )
858
1023
 
859
1024
 
860
- class DFineFrozenBatchNorm2d(RTDetrFrozenBatchNorm2d):
861
- pass
862
-
863
-
864
1025
  class DFineModel(RTDetrModel):
865
1026
  def __init__(self, config: DFineConfig):
866
1027
  super().__init__(config)
@@ -892,10 +1053,10 @@ class DFineForObjectDetection(RTDetrForObjectDetection):
892
1053
  # We can't initialize the model on meta device as some weights are modified during the initialization
893
1054
  _no_split_modules = None
894
1055
  _tied_weights_keys = {
895
- r"bbox_embed.(?![0])\d+": "bbox_embed.0",
896
- r"class_embed.(?![0])\d+": "class_embed.0",
897
- "model.decoder.class_embed": "class_embed",
898
- "model.decoder.bbox_embed": "bbox_embed",
1056
+ r"bbox_embed.(?![0])\d+": r"bbox_embed.0",
1057
+ r"class_embed.(?![0])\d+": r"^class_embed.0",
1058
+ "class_embed": "model.decoder.class_embed",
1059
+ "bbox_embed": "model.decoder.bbox_embed",
899
1060
  }
900
1061
 
901
1062
  def __init__(self, config: DFineConfig):
@@ -972,244 +1133,6 @@ class DFineForObjectDetection(RTDetrForObjectDetection):
972
1133
  super().forward(**super_kwargs)
973
1134
 
974
1135
 
975
- def weighting_function(max_num_bins: int, up: torch.Tensor, reg_scale: int) -> torch.Tensor:
976
- """
977
- Generates the non-uniform Weighting Function W(n) for bounding box regression.
978
-
979
- Args:
980
- max_num_bins (int): Max number of the discrete bins.
981
- up (Tensor): Controls upper bounds of the sequence,
982
- where maximum offset is ±up * H / W.
983
- reg_scale (float): Controls the curvature of the Weighting Function.
984
- Larger values result in flatter weights near the central axis W(max_num_bins/2)=0
985
- and steeper weights at both ends.
986
- Returns:
987
- Tensor: Sequence of Weighting Function.
988
- """
989
- upper_bound1 = abs(up[0]) * abs(reg_scale)
990
- upper_bound2 = abs(up[0]) * abs(reg_scale) * 2
991
- step = (upper_bound1 + 1) ** (2 / (max_num_bins - 2))
992
- left_values = [-((step) ** i) + 1 for i in range(max_num_bins // 2 - 1, 0, -1)]
993
- right_values = [(step) ** i - 1 for i in range(1, max_num_bins // 2)]
994
- values = [-upper_bound2] + left_values + [torch.zeros_like(up[0][None])] + right_values + [upper_bound2]
995
- values = torch.cat(values, 0)
996
- return values
997
-
998
-
999
- class DFineMLPPredictionHead(RTDetrMLPPredictionHead):
1000
- pass
1001
-
1002
-
1003
- def distance2bbox(points, distance: torch.Tensor, reg_scale: float) -> torch.Tensor:
1004
- """
1005
- Decodes edge-distances into bounding box coordinates.
1006
-
1007
- Args:
1008
- points (`torch.Tensor`):
1009
- (batch_size, num_boxes, 4) or (num_boxes, 4) format, representing [x_center, y_center, width, height]
1010
- distance (`torch.Tensor`):
1011
- (batch_size, num_boxes, 4) or (num_boxes, 4), representing distances from the point to the left, top, right, and bottom boundaries.
1012
- reg_scale (`float`):
1013
- Controls the curvature of the Weighting Function.
1014
- Returns:
1015
- `torch.Tensor`: Bounding boxes in (batch_size, num_boxes, 4) or (num_boxes, 4) format, representing [x_center, y_center, width, height]
1016
- """
1017
- reg_scale = abs(reg_scale)
1018
- top_left_x = points[..., 0] - (0.5 * reg_scale + distance[..., 0]) * (points[..., 2] / reg_scale)
1019
- top_left_y = points[..., 1] - (0.5 * reg_scale + distance[..., 1]) * (points[..., 3] / reg_scale)
1020
- bottom_right_x = points[..., 0] + (0.5 * reg_scale + distance[..., 2]) * (points[..., 2] / reg_scale)
1021
- bottom_right_y = points[..., 1] + (0.5 * reg_scale + distance[..., 3]) * (points[..., 3] / reg_scale)
1022
-
1023
- bboxes = torch.stack([top_left_x, top_left_y, bottom_right_x, bottom_right_y], -1)
1024
-
1025
- return corners_to_center_format(bboxes)
1026
-
1027
-
1028
- class DFineMLP(nn.Module):
1029
- def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act: str = "relu"):
1030
- super().__init__()
1031
- self.num_layers = num_layers
1032
- hidden_dims = [hidden_dim] * (num_layers - 1)
1033
- input_dims = [input_dim] + hidden_dims
1034
- output_dims = hidden_dims + [output_dim]
1035
- self.layers = nn.ModuleList(nn.Linear(in_dim, out_dim) for in_dim, out_dim in zip(input_dims, output_dims))
1036
- self.act = ACT2CLS[act]()
1037
-
1038
- def forward(self, stat_features: torch.Tensor) -> torch.Tensor:
1039
- for i, layer in enumerate(self.layers):
1040
- stat_features = self.act(layer(stat_features)) if i < self.num_layers - 1 else layer(stat_features)
1041
- return stat_features
1042
-
1043
-
1044
- class DFineLQE(nn.Module):
1045
- def __init__(self, config: DFineConfig):
1046
- super().__init__()
1047
- self.top_prob_values = config.top_prob_values
1048
- self.max_num_bins = config.max_num_bins
1049
- self.reg_conf = DFineMLP(4 * (self.top_prob_values + 1), config.lqe_hidden_dim, 1, config.lqe_layers)
1050
-
1051
- def forward(self, scores: torch.Tensor, pred_corners: torch.Tensor) -> torch.Tensor:
1052
- batch_size, length, _ = pred_corners.size()
1053
- prob = F.softmax(pred_corners.reshape(batch_size, length, 4, self.max_num_bins + 1), dim=-1)
1054
- prob_topk, _ = prob.topk(self.top_prob_values, dim=-1)
1055
- stat = torch.cat([prob_topk, prob_topk.mean(dim=-1, keepdim=True)], dim=-1)
1056
- quality_score = self.reg_conf(stat.reshape(batch_size, length, -1))
1057
- scores = scores + quality_score
1058
- return scores
1059
-
1060
-
1061
- class DFineConvNormLayer(RTDetrConvNormLayer):
1062
- def __init__(
1063
- self,
1064
- config: DFineConfig,
1065
- in_channels: int,
1066
- out_channels: int,
1067
- kernel_size: int,
1068
- stride: int,
1069
- groups: int = 1,
1070
- padding: int | None = None,
1071
- activation: str | None = None,
1072
- ):
1073
- super().__init__(config, in_channels, out_channels, kernel_size, stride, padding=None, activation=activation)
1074
- self.conv = nn.Conv2d(
1075
- in_channels,
1076
- out_channels,
1077
- kernel_size,
1078
- stride,
1079
- groups=groups,
1080
- padding=(kernel_size - 1) // 2 if padding is None else padding,
1081
- bias=False,
1082
- )
1083
-
1084
-
1085
- class DFineRepVggBlock(RTDetrRepVggBlock):
1086
- def __init__(self, config: DFineConfig, in_channels: int, out_channels: int):
1087
- super().__init__(config)
1088
- hidden_channels = in_channels
1089
- self.conv1 = DFineConvNormLayer(config, hidden_channels, out_channels, 3, 1, padding=1)
1090
- self.conv2 = DFineConvNormLayer(config, hidden_channels, out_channels, 1, 1, padding=0)
1091
-
1092
-
1093
- class DFineCSPRepLayer(nn.Module):
1094
- """
1095
- Cross Stage Partial (CSP) network layer with RepVGG blocks.
1096
- """
1097
-
1098
- def __init__(
1099
- self, config: DFineConfig, in_channels: int, out_channels: int, num_blocks: int, expansion: float = 1.0
1100
- ):
1101
- super().__init__()
1102
- activation = config.activation_function
1103
-
1104
- hidden_channels = int(out_channels * expansion)
1105
- self.conv1 = DFineConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation)
1106
- self.conv2 = DFineConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation)
1107
- self.bottlenecks = nn.ModuleList(
1108
- [DFineRepVggBlock(config, hidden_channels, hidden_channels) for _ in range(num_blocks)]
1109
- )
1110
- if hidden_channels != out_channels:
1111
- self.conv3 = DFineConvNormLayer(config, hidden_channels, out_channels, 1, 1, activation=activation)
1112
- else:
1113
- self.conv3 = nn.Identity()
1114
-
1115
- def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
1116
- hidden_state_1 = self.conv1(hidden_state)
1117
- for bottleneck in self.bottlenecks:
1118
- hidden_state_1 = bottleneck(hidden_state_1)
1119
- hidden_state_2 = self.conv2(hidden_state)
1120
- hidden_state_3 = self.conv3(hidden_state_1 + hidden_state_2)
1121
- return hidden_state_3
1122
-
1123
-
1124
- class DFineRepNCSPELAN4(nn.Module):
1125
- def __init__(self, config: DFineConfig, act: str = "silu", numb_blocks: int = 3):
1126
- super().__init__()
1127
- conv1_dim = config.encoder_hidden_dim * 2
1128
- conv2_dim = config.encoder_hidden_dim
1129
- conv3_dim = config.encoder_hidden_dim * 2
1130
- conv4_dim = round(config.hidden_expansion * config.encoder_hidden_dim // 2)
1131
- self.conv_dim = conv3_dim // 2
1132
- self.conv1 = DFineConvNormLayer(config, conv1_dim, conv3_dim, 1, 1, activation=act)
1133
- self.csp_rep1 = DFineCSPRepLayer(config, conv3_dim // 2, conv4_dim, num_blocks=numb_blocks)
1134
- self.conv2 = DFineConvNormLayer(config, conv4_dim, conv4_dim, 3, 1, activation=act)
1135
- self.csp_rep2 = DFineCSPRepLayer(config, conv4_dim, conv4_dim, num_blocks=numb_blocks)
1136
- self.conv3 = DFineConvNormLayer(config, conv4_dim, conv4_dim, 3, 1, activation=act)
1137
- self.conv4 = DFineConvNormLayer(config, conv3_dim + (2 * conv4_dim), conv2_dim, 1, 1, activation=act)
1138
-
1139
- def forward(self, input_features: torch.Tensor) -> torch.Tensor:
1140
- # Split initial features into two branches after first convolution
1141
- split_features = list(self.conv1(input_features).split((self.conv_dim, self.conv_dim), 1))
1142
-
1143
- # Process branches sequentially
1144
- branch1 = self.csp_rep1(split_features[-1])
1145
- branch1 = self.conv2(branch1)
1146
- branch2 = self.csp_rep2(branch1)
1147
- branch2 = self.conv3(branch2)
1148
-
1149
- split_features.extend([branch1, branch2])
1150
- merged_features = torch.cat(split_features, 1)
1151
- merged_features = self.conv4(merged_features)
1152
- return merged_features
1153
-
1154
-
1155
- class DFineSCDown(nn.Module):
1156
- def __init__(self, config: DFineConfig, kernel_size: int, stride: int):
1157
- super().__init__()
1158
- self.conv1 = DFineConvNormLayer(config, config.encoder_hidden_dim, config.encoder_hidden_dim, 1, 1)
1159
- self.conv2 = DFineConvNormLayer(
1160
- config,
1161
- config.encoder_hidden_dim,
1162
- config.encoder_hidden_dim,
1163
- kernel_size,
1164
- stride,
1165
- config.encoder_hidden_dim,
1166
- )
1167
-
1168
- def forward(self, input_features: torch.Tensor) -> torch.Tensor:
1169
- input_features = self.conv1(input_features)
1170
- input_features = self.conv2(input_features)
1171
- return input_features
1172
-
1173
-
1174
- class DFineEncoder(RTDetrEncoder):
1175
- pass
1176
-
1177
-
1178
- class DFineHybridEncoder(RTDetrHybridEncoder):
1179
- def __init__(self, config: DFineConfig):
1180
- nn.Module.__init__(self)
1181
- self.config = config
1182
- self.in_channels = config.encoder_in_channels
1183
- self.num_fpn_stages = len(self.in_channels) - 1
1184
- self.feat_strides = config.feat_strides
1185
- self.encoder_hidden_dim = config.encoder_hidden_dim
1186
- self.encode_proj_layers = config.encode_proj_layers
1187
- self.positional_encoding_temperature = config.positional_encoding_temperature
1188
- self.eval_size = config.eval_size
1189
- self.out_channels = [self.encoder_hidden_dim for _ in self.in_channels]
1190
- self.out_strides = self.feat_strides
1191
-
1192
- # encoder transformer
1193
- self.encoder = nn.ModuleList([DFineEncoder(config) for _ in range(len(self.encode_proj_layers))])
1194
- # top-down fpn
1195
- self.lateral_convs = nn.ModuleList()
1196
- self.fpn_blocks = nn.ModuleList()
1197
- for _ in range(len(self.in_channels) - 1, 0, -1):
1198
- lateral_layer = DFineConvNormLayer(config, self.encoder_hidden_dim, self.encoder_hidden_dim, 1, 1)
1199
- self.lateral_convs.append(lateral_layer)
1200
- num_blocks = round(3 * config.depth_mult)
1201
- fpn_layer = DFineRepNCSPELAN4(config, numb_blocks=num_blocks)
1202
- self.fpn_blocks.append(fpn_layer)
1203
-
1204
- # bottom-up pan
1205
- self.downsample_convs = nn.ModuleList()
1206
- self.pan_blocks = nn.ModuleList()
1207
- for _ in range(len(self.in_channels) - 1):
1208
- self.downsample_convs.append(DFineSCDown(config, 3, 2))
1209
- num_blocks = round(3 * config.depth_mult)
1210
- self.pan_blocks.append(DFineRepNCSPELAN4(config, numb_blocks=num_blocks))
1211
-
1212
-
1213
1136
  __all__ = [
1214
1137
  "DFineConfig",
1215
1138
  "DFineModel",