transformers-5.0.0rc2-py3-none-any.whl → transformers-5.0.0rc3-py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their public registry.
Files changed (1537)
  1. transformers/__init__.py +9 -28
  2. transformers/audio_utils.py +32 -32
  3. transformers/cache_utils.py +15 -124
  4. transformers/cli/chat.py +3 -3
  5. transformers/cli/serve.py +2 -2
  6. transformers/cli/transformers.py +2 -1
  7. transformers/configuration_utils.py +31 -33
  8. transformers/conversion_mapping.py +5 -1
  9. transformers/convert_slow_tokenizer.py +3 -8
  10. transformers/core_model_loading.py +14 -15
  11. transformers/data/processors/glue.py +0 -1
  12. transformers/data/processors/utils.py +0 -1
  13. transformers/data/processors/xnli.py +0 -1
  14. transformers/dependency_versions_table.py +4 -4
  15. transformers/distributed/configuration_utils.py +1 -2
  16. transformers/dynamic_module_utils.py +23 -23
  17. transformers/feature_extraction_sequence_utils.py +19 -23
  18. transformers/feature_extraction_utils.py +14 -14
  19. transformers/generation/candidate_generator.py +1 -2
  20. transformers/generation/configuration_utils.py +54 -39
  21. transformers/generation/continuous_batching/__init__.py +0 -1
  22. transformers/generation/continuous_batching/cache.py +34 -6
  23. transformers/generation/continuous_batching/cache_manager.py +25 -12
  24. transformers/generation/continuous_batching/continuous_api.py +54 -23
  25. transformers/generation/continuous_batching/requests.py +25 -4
  26. transformers/generation/continuous_batching/scheduler.py +117 -49
  27. transformers/generation/logits_process.py +0 -128
  28. transformers/generation/streamers.py +0 -1
  29. transformers/generation/utils.py +16 -26
  30. transformers/generation/watermarking.py +2 -3
  31. transformers/hf_argparser.py +9 -13
  32. transformers/hyperparameter_search.py +1 -2
  33. transformers/image_processing_base.py +9 -9
  34. transformers/image_processing_utils.py +11 -12
  35. transformers/image_processing_utils_fast.py +53 -53
  36. transformers/image_transforms.py +29 -29
  37. transformers/image_utils.py +30 -32
  38. transformers/integrations/awq.py +1 -3
  39. transformers/integrations/deepspeed.py +1 -1
  40. transformers/integrations/eetq.py +0 -1
  41. transformers/integrations/fbgemm_fp8.py +1 -2
  42. transformers/integrations/finegrained_fp8.py +8 -7
  43. transformers/integrations/flash_attention.py +1 -1
  44. transformers/integrations/flex_attention.py +1 -1
  45. transformers/integrations/fp_quant.py +4 -6
  46. transformers/integrations/ggml.py +0 -1
  47. transformers/integrations/integration_utils.py +2 -3
  48. transformers/integrations/mxfp4.py +5 -6
  49. transformers/integrations/quark.py +2 -4
  50. transformers/integrations/torchao.py +4 -6
  51. transformers/loss/loss_lw_detr.py +356 -0
  52. transformers/loss/loss_utils.py +2 -0
  53. transformers/masking_utils.py +47 -51
  54. transformers/model_debugging_utils.py +4 -5
  55. transformers/modelcard.py +14 -192
  56. transformers/modeling_attn_mask_utils.py +19 -19
  57. transformers/modeling_flash_attention_utils.py +27 -27
  58. transformers/modeling_gguf_pytorch_utils.py +5 -5
  59. transformers/modeling_layers.py +21 -22
  60. transformers/modeling_outputs.py +242 -253
  61. transformers/modeling_rope_utils.py +32 -32
  62. transformers/modeling_utils.py +67 -90
  63. transformers/models/__init__.py +4 -0
  64. transformers/models/afmoe/configuration_afmoe.py +26 -29
  65. transformers/models/afmoe/modeling_afmoe.py +30 -33
  66. transformers/models/afmoe/modular_afmoe.py +16 -18
  67. transformers/models/aimv2/configuration_aimv2.py +2 -5
  68. transformers/models/aimv2/modeling_aimv2.py +20 -21
  69. transformers/models/aimv2/modular_aimv2.py +7 -9
  70. transformers/models/albert/configuration_albert.py +0 -1
  71. transformers/models/albert/modeling_albert.py +67 -69
  72. transformers/models/albert/tokenization_albert.py +1 -4
  73. transformers/models/align/configuration_align.py +0 -1
  74. transformers/models/align/modeling_align.py +61 -62
  75. transformers/models/align/processing_align.py +2 -30
  76. transformers/models/altclip/configuration_altclip.py +0 -1
  77. transformers/models/altclip/modeling_altclip.py +76 -77
  78. transformers/models/altclip/processing_altclip.py +2 -15
  79. transformers/models/apertus/__init__.py +0 -1
  80. transformers/models/apertus/configuration_apertus.py +18 -21
  81. transformers/models/apertus/modeling_apertus.py +31 -34
  82. transformers/models/apertus/modular_apertus.py +28 -30
  83. transformers/models/arcee/configuration_arcee.py +20 -23
  84. transformers/models/arcee/modeling_arcee.py +31 -34
  85. transformers/models/arcee/modular_arcee.py +20 -23
  86. transformers/models/aria/configuration_aria.py +20 -23
  87. transformers/models/aria/image_processing_aria.py +25 -27
  88. transformers/models/aria/modeling_aria.py +63 -66
  89. transformers/models/aria/modular_aria.py +78 -85
  90. transformers/models/aria/processing_aria.py +28 -35
  91. transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +0 -1
  92. transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +3 -6
  93. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +6 -8
  94. transformers/models/audioflamingo3/__init__.py +0 -1
  95. transformers/models/audioflamingo3/configuration_audioflamingo3.py +0 -1
  96. transformers/models/audioflamingo3/modeling_audioflamingo3.py +22 -23
  97. transformers/models/audioflamingo3/modular_audioflamingo3.py +12 -17
  98. transformers/models/audioflamingo3/processing_audioflamingo3.py +6 -8
  99. transformers/models/auto/auto_factory.py +4 -5
  100. transformers/models/auto/configuration_auto.py +26 -5
  101. transformers/models/auto/feature_extraction_auto.py +5 -7
  102. transformers/models/auto/image_processing_auto.py +13 -26
  103. transformers/models/auto/modeling_auto.py +18 -199
  104. transformers/models/auto/processing_auto.py +2 -1
  105. transformers/models/auto/tokenization_auto.py +21 -22
  106. transformers/models/auto/video_processing_auto.py +7 -8
  107. transformers/models/autoformer/configuration_autoformer.py +4 -7
  108. transformers/models/autoformer/modeling_autoformer.py +98 -100
  109. transformers/models/aya_vision/configuration_aya_vision.py +0 -1
  110. transformers/models/aya_vision/modeling_aya_vision.py +35 -37
  111. transformers/models/aya_vision/modular_aya_vision.py +26 -29
  112. transformers/models/aya_vision/processing_aya_vision.py +25 -53
  113. transformers/models/bamba/configuration_bamba.py +29 -32
  114. transformers/models/bamba/modeling_bamba.py +60 -64
  115. transformers/models/bamba/modular_bamba.py +51 -55
  116. transformers/models/bark/configuration_bark.py +4 -7
  117. transformers/models/bark/generation_configuration_bark.py +3 -5
  118. transformers/models/bark/modeling_bark.py +40 -55
  119. transformers/models/bark/processing_bark.py +19 -41
  120. transformers/models/bart/configuration_bart.py +0 -1
  121. transformers/models/bart/modeling_bart.py +115 -117
  122. transformers/models/barthez/tokenization_barthez.py +1 -4
  123. transformers/models/bartpho/tokenization_bartpho.py +6 -7
  124. transformers/models/beit/configuration_beit.py +0 -11
  125. transformers/models/beit/image_processing_beit.py +53 -56
  126. transformers/models/beit/image_processing_beit_fast.py +8 -9
  127. transformers/models/beit/modeling_beit.py +51 -53
  128. transformers/models/bert/configuration_bert.py +0 -1
  129. transformers/models/bert/modeling_bert.py +111 -122
  130. transformers/models/bert/tokenization_bert.py +2 -4
  131. transformers/models/bert/tokenization_bert_legacy.py +3 -5
  132. transformers/models/bert_generation/configuration_bert_generation.py +0 -1
  133. transformers/models/bert_generation/modeling_bert_generation.py +47 -49
  134. transformers/models/bert_generation/tokenization_bert_generation.py +2 -3
  135. transformers/models/bert_japanese/tokenization_bert_japanese.py +5 -6
  136. transformers/models/bertweet/tokenization_bertweet.py +1 -3
  137. transformers/models/big_bird/configuration_big_bird.py +0 -1
  138. transformers/models/big_bird/modeling_big_bird.py +107 -109
  139. transformers/models/big_bird/tokenization_big_bird.py +1 -4
  140. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +0 -1
  141. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +109 -111
  142. transformers/models/biogpt/configuration_biogpt.py +0 -1
  143. transformers/models/biogpt/modeling_biogpt.py +69 -71
  144. transformers/models/biogpt/modular_biogpt.py +59 -61
  145. transformers/models/biogpt/tokenization_biogpt.py +3 -5
  146. transformers/models/bit/configuration_bit.py +0 -1
  147. transformers/models/bit/image_processing_bit.py +21 -24
  148. transformers/models/bit/image_processing_bit_fast.py +0 -1
  149. transformers/models/bit/modeling_bit.py +9 -11
  150. transformers/models/bitnet/configuration_bitnet.py +18 -21
  151. transformers/models/bitnet/modeling_bitnet.py +31 -34
  152. transformers/models/bitnet/modular_bitnet.py +4 -6
  153. transformers/models/blenderbot/configuration_blenderbot.py +0 -1
  154. transformers/models/blenderbot/modeling_blenderbot.py +64 -95
  155. transformers/models/blenderbot/tokenization_blenderbot.py +0 -1
  156. transformers/models/blenderbot_small/configuration_blenderbot_small.py +0 -1
  157. transformers/models/blenderbot_small/modeling_blenderbot_small.py +66 -68
  158. transformers/models/blenderbot_small/tokenization_blenderbot_small.py +1 -3
  159. transformers/models/blip/configuration_blip.py +0 -1
  160. transformers/models/blip/image_processing_blip.py +17 -20
  161. transformers/models/blip/image_processing_blip_fast.py +0 -1
  162. transformers/models/blip/modeling_blip.py +60 -71
  163. transformers/models/blip/modeling_blip_text.py +63 -65
  164. transformers/models/blip/processing_blip.py +5 -36
  165. transformers/models/blip_2/configuration_blip_2.py +0 -1
  166. transformers/models/blip_2/modeling_blip_2.py +70 -71
  167. transformers/models/blip_2/processing_blip_2.py +8 -38
  168. transformers/models/bloom/configuration_bloom.py +0 -1
  169. transformers/models/bloom/modeling_bloom.py +58 -59
  170. transformers/models/blt/configuration_blt.py +71 -74
  171. transformers/models/blt/modeling_blt.py +73 -76
  172. transformers/models/blt/modular_blt.py +57 -59
  173. transformers/models/bridgetower/configuration_bridgetower.py +0 -1
  174. transformers/models/bridgetower/image_processing_bridgetower.py +34 -35
  175. transformers/models/bridgetower/image_processing_bridgetower_fast.py +7 -8
  176. transformers/models/bridgetower/modeling_bridgetower.py +107 -109
  177. transformers/models/bridgetower/processing_bridgetower.py +2 -16
  178. transformers/models/bros/configuration_bros.py +0 -1
  179. transformers/models/bros/modeling_bros.py +78 -80
  180. transformers/models/bros/processing_bros.py +2 -12
  181. transformers/models/byt5/tokenization_byt5.py +4 -6
  182. transformers/models/camembert/configuration_camembert.py +0 -1
  183. transformers/models/camembert/modeling_camembert.py +91 -93
  184. transformers/models/camembert/modular_camembert.py +51 -54
  185. transformers/models/camembert/tokenization_camembert.py +1 -4
  186. transformers/models/canine/configuration_canine.py +0 -1
  187. transformers/models/canine/modeling_canine.py +73 -75
  188. transformers/models/canine/tokenization_canine.py +0 -1
  189. transformers/models/chameleon/configuration_chameleon.py +24 -27
  190. transformers/models/chameleon/image_processing_chameleon.py +21 -24
  191. transformers/models/chameleon/image_processing_chameleon_fast.py +0 -1
  192. transformers/models/chameleon/modeling_chameleon.py +53 -56
  193. transformers/models/chameleon/processing_chameleon.py +16 -41
  194. transformers/models/chinese_clip/configuration_chinese_clip.py +0 -1
  195. transformers/models/chinese_clip/image_processing_chinese_clip.py +21 -24
  196. transformers/models/chinese_clip/image_processing_chinese_clip_fast.py +0 -1
  197. transformers/models/chinese_clip/modeling_chinese_clip.py +65 -66
  198. transformers/models/chinese_clip/processing_chinese_clip.py +2 -15
  199. transformers/models/clap/configuration_clap.py +0 -1
  200. transformers/models/clap/feature_extraction_clap.py +9 -10
  201. transformers/models/clap/modeling_clap.py +88 -89
  202. transformers/models/clap/processing_clap.py +2 -15
  203. transformers/models/clip/configuration_clip.py +0 -1
  204. transformers/models/clip/image_processing_clip.py +21 -24
  205. transformers/models/clip/image_processing_clip_fast.py +0 -1
  206. transformers/models/clip/modeling_clip.py +45 -46
  207. transformers/models/clip/processing_clip.py +2 -14
  208. transformers/models/clip/tokenization_clip.py +2 -5
  209. transformers/models/clipseg/configuration_clipseg.py +0 -1
  210. transformers/models/clipseg/modeling_clipseg.py +86 -87
  211. transformers/models/clipseg/processing_clipseg.py +8 -39
  212. transformers/models/clvp/configuration_clvp.py +1 -3
  213. transformers/models/clvp/feature_extraction_clvp.py +7 -10
  214. transformers/models/clvp/modeling_clvp.py +119 -115
  215. transformers/models/clvp/number_normalizer.py +1 -2
  216. transformers/models/clvp/processing_clvp.py +3 -20
  217. transformers/models/clvp/tokenization_clvp.py +0 -1
  218. transformers/models/code_llama/tokenization_code_llama.py +3 -6
  219. transformers/models/codegen/configuration_codegen.py +0 -1
  220. transformers/models/codegen/modeling_codegen.py +48 -48
  221. transformers/models/codegen/tokenization_codegen.py +5 -6
  222. transformers/models/cohere/configuration_cohere.py +20 -23
  223. transformers/models/cohere/modeling_cohere.py +35 -38
  224. transformers/models/cohere/modular_cohere.py +24 -28
  225. transformers/models/cohere/tokenization_cohere.py +5 -6
  226. transformers/models/cohere2/configuration_cohere2.py +21 -24
  227. transformers/models/cohere2/modeling_cohere2.py +34 -37
  228. transformers/models/cohere2/modular_cohere2.py +39 -41
  229. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +6 -7
  230. transformers/models/cohere2_vision/modeling_cohere2_vision.py +28 -30
  231. transformers/models/cohere2_vision/modular_cohere2_vision.py +21 -23
  232. transformers/models/cohere2_vision/processing_cohere2_vision.py +6 -36
  233. transformers/models/colpali/configuration_colpali.py +0 -1
  234. transformers/models/colpali/modeling_colpali.py +14 -16
  235. transformers/models/colpali/modular_colpali.py +11 -51
  236. transformers/models/colpali/processing_colpali.py +14 -52
  237. transformers/models/colqwen2/modeling_colqwen2.py +20 -22
  238. transformers/models/colqwen2/modular_colqwen2.py +29 -68
  239. transformers/models/colqwen2/processing_colqwen2.py +16 -52
  240. transformers/models/conditional_detr/configuration_conditional_detr.py +0 -1
  241. transformers/models/conditional_detr/image_processing_conditional_detr.py +64 -66
  242. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +22 -22
  243. transformers/models/conditional_detr/modeling_conditional_detr.py +78 -80
  244. transformers/models/conditional_detr/modular_conditional_detr.py +1 -3
  245. transformers/models/convbert/configuration_convbert.py +0 -1
  246. transformers/models/convbert/modeling_convbert.py +85 -87
  247. transformers/models/convbert/tokenization_convbert.py +0 -1
  248. transformers/models/convnext/configuration_convnext.py +0 -1
  249. transformers/models/convnext/image_processing_convnext.py +18 -21
  250. transformers/models/convnext/image_processing_convnext_fast.py +5 -6
  251. transformers/models/convnext/modeling_convnext.py +5 -8
  252. transformers/models/convnextv2/configuration_convnextv2.py +0 -1
  253. transformers/models/convnextv2/modeling_convnextv2.py +5 -8
  254. transformers/models/cpm/tokenization_cpm.py +6 -7
  255. transformers/models/cpm/tokenization_cpm_fast.py +3 -5
  256. transformers/models/cpmant/configuration_cpmant.py +0 -1
  257. transformers/models/cpmant/modeling_cpmant.py +38 -40
  258. transformers/models/cpmant/tokenization_cpmant.py +1 -3
  259. transformers/models/csm/configuration_csm.py +49 -51
  260. transformers/models/csm/generation_csm.py +13 -14
  261. transformers/models/csm/modeling_csm.py +78 -81
  262. transformers/models/csm/modular_csm.py +56 -58
  263. transformers/models/csm/processing_csm.py +25 -68
  264. transformers/models/ctrl/configuration_ctrl.py +0 -1
  265. transformers/models/ctrl/modeling_ctrl.py +38 -41
  266. transformers/models/ctrl/tokenization_ctrl.py +0 -1
  267. transformers/models/cvt/configuration_cvt.py +0 -1
  268. transformers/models/cvt/modeling_cvt.py +13 -15
  269. transformers/models/cwm/__init__.py +0 -1
  270. transformers/models/cwm/configuration_cwm.py +3 -5
  271. transformers/models/cwm/modeling_cwm.py +32 -34
  272. transformers/models/cwm/modular_cwm.py +10 -12
  273. transformers/models/d_fine/configuration_d_fine.py +0 -1
  274. transformers/models/d_fine/modeling_d_fine.py +81 -82
  275. transformers/models/d_fine/modular_d_fine.py +8 -9
  276. transformers/models/dab_detr/configuration_dab_detr.py +0 -1
  277. transformers/models/dab_detr/modeling_dab_detr.py +68 -70
  278. transformers/models/dac/configuration_dac.py +0 -1
  279. transformers/models/dac/feature_extraction_dac.py +6 -9
  280. transformers/models/dac/modeling_dac.py +21 -23
  281. transformers/models/data2vec/configuration_data2vec_audio.py +0 -1
  282. transformers/models/data2vec/configuration_data2vec_text.py +0 -1
  283. transformers/models/data2vec/configuration_data2vec_vision.py +0 -1
  284. transformers/models/data2vec/modeling_data2vec_audio.py +52 -56
  285. transformers/models/data2vec/modeling_data2vec_text.py +91 -93
  286. transformers/models/data2vec/modeling_data2vec_vision.py +41 -42
  287. transformers/models/data2vec/modular_data2vec_audio.py +6 -1
  288. transformers/models/data2vec/modular_data2vec_text.py +51 -54
  289. transformers/models/dbrx/configuration_dbrx.py +18 -19
  290. transformers/models/dbrx/modeling_dbrx.py +39 -42
  291. transformers/models/dbrx/modular_dbrx.py +31 -33
  292. transformers/models/deberta/configuration_deberta.py +0 -1
  293. transformers/models/deberta/modeling_deberta.py +57 -60
  294. transformers/models/deberta/tokenization_deberta.py +2 -5
  295. transformers/models/deberta_v2/configuration_deberta_v2.py +0 -1
  296. transformers/models/deberta_v2/modeling_deberta_v2.py +63 -65
  297. transformers/models/deberta_v2/tokenization_deberta_v2.py +1 -4
  298. transformers/models/decision_transformer/configuration_decision_transformer.py +0 -1
  299. transformers/models/decision_transformer/modeling_decision_transformer.py +48 -50
  300. transformers/models/deepseek_v2/configuration_deepseek_v2.py +34 -37
  301. transformers/models/deepseek_v2/modeling_deepseek_v2.py +32 -33
  302. transformers/models/deepseek_v2/modular_deepseek_v2.py +40 -42
  303. transformers/models/deepseek_v3/configuration_deepseek_v3.py +35 -38
  304. transformers/models/deepseek_v3/modeling_deepseek_v3.py +31 -33
  305. transformers/models/deepseek_v3/modular_deepseek_v3.py +4 -5
  306. transformers/models/deepseek_vl/configuration_deepseek_vl.py +2 -3
  307. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +25 -26
  308. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +7 -6
  309. transformers/models/deepseek_vl/modeling_deepseek_vl.py +31 -31
  310. transformers/models/deepseek_vl/modular_deepseek_vl.py +11 -43
  311. transformers/models/deepseek_vl/processing_deepseek_vl.py +10 -41
  312. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +3 -5
  313. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +35 -35
  314. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +16 -16
  315. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +33 -33
  316. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +71 -90
  317. transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +12 -44
  318. transformers/models/deformable_detr/configuration_deformable_detr.py +0 -1
  319. transformers/models/deformable_detr/image_processing_deformable_detr.py +59 -61
  320. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +17 -17
  321. transformers/models/deformable_detr/modeling_deformable_detr.py +66 -67
  322. transformers/models/deformable_detr/modular_deformable_detr.py +1 -3
  323. transformers/models/deit/configuration_deit.py +0 -1
  324. transformers/models/deit/image_processing_deit.py +18 -21
  325. transformers/models/deit/image_processing_deit_fast.py +0 -1
  326. transformers/models/deit/modeling_deit.py +16 -18
  327. transformers/models/depth_anything/configuration_depth_anything.py +0 -1
  328. transformers/models/depth_anything/modeling_depth_anything.py +5 -8
  329. transformers/models/depth_pro/configuration_depth_pro.py +0 -1
  330. transformers/models/depth_pro/image_processing_depth_pro.py +22 -23
  331. transformers/models/depth_pro/image_processing_depth_pro_fast.py +6 -7
  332. transformers/models/depth_pro/modeling_depth_pro.py +21 -23
  333. transformers/models/detr/configuration_detr.py +0 -1
  334. transformers/models/detr/image_processing_detr.py +64 -66
  335. transformers/models/detr/image_processing_detr_fast.py +22 -23
  336. transformers/models/detr/modeling_detr.py +70 -72
  337. transformers/models/dia/configuration_dia.py +5 -8
  338. transformers/models/dia/feature_extraction_dia.py +6 -9
  339. transformers/models/dia/generation_dia.py +40 -36
  340. transformers/models/dia/modeling_dia.py +61 -64
  341. transformers/models/dia/modular_dia.py +52 -54
  342. transformers/models/dia/processing_dia.py +39 -29
  343. transformers/models/dia/tokenization_dia.py +3 -6
  344. transformers/models/diffllama/configuration_diffllama.py +20 -23
  345. transformers/models/diffllama/modeling_diffllama.py +42 -45
  346. transformers/models/diffllama/modular_diffllama.py +16 -18
  347. transformers/models/dinat/configuration_dinat.py +0 -1
  348. transformers/models/dinat/modeling_dinat.py +40 -42
  349. transformers/models/dinov2/configuration_dinov2.py +0 -1
  350. transformers/models/dinov2/modeling_dinov2.py +11 -13
  351. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +1 -1
  352. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +12 -13
  353. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +5 -7
  354. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +4 -7
  355. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +3 -6
  356. transformers/models/dinov3_vit/configuration_dinov3_vit.py +5 -8
  357. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +5 -6
  358. transformers/models/dinov3_vit/modeling_dinov3_vit.py +14 -16
  359. transformers/models/dinov3_vit/modular_dinov3_vit.py +11 -13
  360. transformers/models/distilbert/configuration_distilbert.py +0 -1
  361. transformers/models/distilbert/modeling_distilbert.py +44 -46
  362. transformers/models/distilbert/tokenization_distilbert.py +0 -1
  363. transformers/models/doge/__init__.py +0 -1
  364. transformers/models/doge/configuration_doge.py +25 -28
  365. transformers/models/doge/modeling_doge.py +42 -45
  366. transformers/models/doge/modular_doge.py +57 -58
  367. transformers/models/donut/configuration_donut_swin.py +0 -1
  368. transformers/models/donut/image_processing_donut.py +26 -29
  369. transformers/models/donut/image_processing_donut_fast.py +5 -10
  370. transformers/models/donut/modeling_donut_swin.py +44 -46
  371. transformers/models/donut/processing_donut.py +5 -26
  372. transformers/models/dots1/configuration_dots1.py +27 -29
  373. transformers/models/dots1/modeling_dots1.py +31 -34
  374. transformers/models/dots1/modular_dots1.py +0 -1
  375. transformers/models/dpr/configuration_dpr.py +0 -1
  376. transformers/models/dpr/modeling_dpr.py +37 -39
  377. transformers/models/dpr/tokenization_dpr.py +7 -9
  378. transformers/models/dpr/tokenization_dpr_fast.py +7 -9
  379. transformers/models/dpt/configuration_dpt.py +0 -1
  380. transformers/models/dpt/image_processing_dpt.py +65 -66
  381. transformers/models/dpt/image_processing_dpt_fast.py +13 -14
  382. transformers/models/dpt/modeling_dpt.py +19 -21
  383. transformers/models/dpt/modular_dpt.py +10 -11
  384. transformers/models/edgetam/configuration_edgetam.py +0 -1
  385. transformers/models/edgetam/modeling_edgetam.py +39 -41
  386. transformers/models/edgetam/modular_edgetam.py +2 -6
  387. transformers/models/edgetam_video/__init__.py +0 -1
  388. transformers/models/edgetam_video/configuration_edgetam_video.py +0 -1
  389. transformers/models/edgetam_video/modeling_edgetam_video.py +76 -77
  390. transformers/models/edgetam_video/modular_edgetam_video.py +16 -18
  391. transformers/models/efficientloftr/configuration_efficientloftr.py +4 -5
  392. transformers/models/efficientloftr/image_processing_efficientloftr.py +14 -16
  393. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -4
  394. transformers/models/efficientloftr/modeling_efficientloftr.py +27 -29
  395. transformers/models/efficientloftr/modular_efficientloftr.py +1 -3
  396. transformers/models/efficientnet/configuration_efficientnet.py +0 -1
  397. transformers/models/efficientnet/image_processing_efficientnet.py +23 -26
  398. transformers/models/efficientnet/image_processing_efficientnet_fast.py +14 -15
  399. transformers/models/efficientnet/modeling_efficientnet.py +12 -14
  400. transformers/models/electra/configuration_electra.py +0 -1
  401. transformers/models/electra/modeling_electra.py +101 -103
  402. transformers/models/emu3/configuration_emu3.py +5 -7
  403. transformers/models/emu3/image_processing_emu3.py +44 -39
  404. transformers/models/emu3/modeling_emu3.py +59 -62
  405. transformers/models/emu3/modular_emu3.py +32 -34
  406. transformers/models/emu3/processing_emu3.py +18 -43
  407. transformers/models/encodec/configuration_encodec.py +2 -4
  408. transformers/models/encodec/feature_extraction_encodec.py +10 -13
  409. transformers/models/encodec/modeling_encodec.py +25 -29
  410. transformers/models/encoder_decoder/configuration_encoder_decoder.py +0 -1
  411. transformers/models/encoder_decoder/modeling_encoder_decoder.py +17 -19
  412. transformers/models/eomt/configuration_eomt.py +0 -1
  413. transformers/models/eomt/image_processing_eomt.py +53 -55
  414. transformers/models/eomt/image_processing_eomt_fast.py +15 -16
  415. transformers/models/eomt/modeling_eomt.py +16 -18
  416. transformers/models/eomt/modular_eomt.py +11 -13
  417. transformers/models/ernie/configuration_ernie.py +0 -1
  418. transformers/models/ernie/modeling_ernie.py +121 -132
  419. transformers/models/ernie/modular_ernie.py +91 -103
  420. transformers/models/ernie4_5/configuration_ernie4_5.py +18 -20
  421. transformers/models/ernie4_5/modeling_ernie4_5.py +31 -33
  422. transformers/models/ernie4_5/modular_ernie4_5.py +1 -3
  423. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +27 -29
  424. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +36 -38
  425. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +7 -9
  426. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +0 -1
  427. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +34 -35
  428. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +6 -7
  429. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +84 -87
  430. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +86 -89
  431. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +3 -5
  432. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +17 -18
  433. transformers/models/esm/configuration_esm.py +2 -4
  434. transformers/models/esm/modeling_esm.py +32 -34
  435. transformers/models/esm/modeling_esmfold.py +42 -44
  436. transformers/models/esm/openfold_utils/chunk_utils.py +6 -6
  437. transformers/models/esm/openfold_utils/loss.py +1 -2
  438. transformers/models/esm/openfold_utils/protein.py +13 -13
  439. transformers/models/esm/openfold_utils/tensor_utils.py +6 -6
  440. transformers/models/esm/tokenization_esm.py +2 -4
  441. transformers/models/evolla/configuration_evolla.py +29 -32
  442. transformers/models/evolla/modeling_evolla.py +58 -61
  443. transformers/models/evolla/modular_evolla.py +45 -47
  444. transformers/models/evolla/processing_evolla.py +23 -35
  445. transformers/models/exaone4/configuration_exaone4.py +19 -22
  446. transformers/models/exaone4/modeling_exaone4.py +32 -35
  447. transformers/models/exaone4/modular_exaone4.py +40 -42
  448. transformers/models/falcon/configuration_falcon.py +22 -25
  449. transformers/models/falcon/modeling_falcon.py +73 -76
  450. transformers/models/falcon_h1/configuration_falcon_h1.py +40 -43
  451. transformers/models/falcon_h1/modeling_falcon_h1.py +52 -55
  452. transformers/models/falcon_h1/modular_falcon_h1.py +47 -48
  453. transformers/models/falcon_mamba/configuration_falcon_mamba.py +0 -1
  454. transformers/models/falcon_mamba/modeling_falcon_mamba.py +46 -47
  455. transformers/models/falcon_mamba/modular_falcon_mamba.py +10 -13
  456. transformers/models/fast_vlm/configuration_fast_vlm.py +1 -0
  457. transformers/models/fast_vlm/modeling_fast_vlm.py +36 -36
  458. transformers/models/fast_vlm/modular_fast_vlm.py +2 -3
  459. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +2 -5
  460. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +45 -47
  461. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -3
  462. transformers/models/flaubert/configuration_flaubert.py +0 -1
  463. transformers/models/flaubert/modeling_flaubert.py +124 -128
  464. transformers/models/flaubert/tokenization_flaubert.py +3 -5
  465. transformers/models/flava/configuration_flava.py +5 -6
  466. transformers/models/flava/image_processing_flava.py +66 -67
  467. transformers/models/flava/image_processing_flava_fast.py +42 -43
  468. transformers/models/flava/modeling_flava.py +108 -107
  469. transformers/models/flava/processing_flava.py +2 -12
  470. transformers/models/flex_olmo/__init__.py +0 -1
  471. transformers/models/flex_olmo/configuration_flex_olmo.py +23 -25
  472. transformers/models/flex_olmo/modeling_flex_olmo.py +37 -39
  473. transformers/models/flex_olmo/modular_flex_olmo.py +35 -37
  474. transformers/models/florence2/configuration_florence2.py +0 -1
  475. transformers/models/florence2/modeling_florence2.py +39 -40
  476. transformers/models/florence2/modular_florence2.py +52 -81
  477. transformers/models/florence2/processing_florence2.py +18 -47
  478. transformers/models/fnet/configuration_fnet.py +0 -1
  479. transformers/models/fnet/modeling_fnet.py +69 -80
  480. transformers/models/fnet/tokenization_fnet.py +0 -1
  481. transformers/models/focalnet/configuration_focalnet.py +0 -1
  482. transformers/models/focalnet/modeling_focalnet.py +39 -41
  483. transformers/models/fsmt/configuration_fsmt.py +0 -1
  484. transformers/models/fsmt/modeling_fsmt.py +47 -48
  485. transformers/models/fsmt/tokenization_fsmt.py +3 -5
  486. transformers/models/funnel/configuration_funnel.py +0 -1
  487. transformers/models/funnel/modeling_funnel.py +91 -93
  488. transformers/models/funnel/tokenization_funnel.py +2 -5
  489. transformers/models/fuyu/configuration_fuyu.py +23 -26
  490. transformers/models/fuyu/image_processing_fuyu.py +29 -31
  491. transformers/models/fuyu/image_processing_fuyu_fast.py +12 -13
  492. transformers/models/fuyu/modeling_fuyu.py +26 -29
  493. transformers/models/fuyu/processing_fuyu.py +9 -36
  494. transformers/models/gemma/configuration_gemma.py +20 -23
  495. transformers/models/gemma/modeling_gemma.py +32 -34
  496. transformers/models/gemma/modular_gemma.py +28 -29
  497. transformers/models/gemma/tokenization_gemma.py +3 -6
  498. transformers/models/gemma2/configuration_gemma2.py +25 -28
  499. transformers/models/gemma2/modeling_gemma2.py +34 -37
  500. transformers/models/gemma2/modular_gemma2.py +55 -57
  501. transformers/models/gemma3/configuration_gemma3.py +28 -29
  502. transformers/models/gemma3/image_processing_gemma3.py +29 -31
  503. transformers/models/gemma3/image_processing_gemma3_fast.py +9 -10
  504. transformers/models/gemma3/modeling_gemma3.py +86 -89
  505. transformers/models/gemma3/modular_gemma3.py +85 -86
  506. transformers/models/gemma3/processing_gemma3.py +5 -5
  507. transformers/models/gemma3n/configuration_gemma3n.py +9 -10
  508. transformers/models/gemma3n/feature_extraction_gemma3n.py +9 -11
  509. transformers/models/gemma3n/modeling_gemma3n.py +80 -89
  510. transformers/models/gemma3n/modular_gemma3n.py +66 -75
  511. transformers/models/gemma3n/processing_gemma3n.py +12 -26
  512. transformers/models/git/configuration_git.py +0 -1
  513. transformers/models/git/modeling_git.py +84 -86
  514. transformers/models/git/processing_git.py +2 -14
  515. transformers/models/glm/configuration_glm.py +19 -21
  516. transformers/models/glm/modeling_glm.py +32 -35
  517. transformers/models/glm/modular_glm.py +4 -7
  518. transformers/models/glm4/configuration_glm4.py +19 -21
  519. transformers/models/glm4/modeling_glm4.py +35 -37
  520. transformers/models/glm4/modular_glm4.py +8 -10
  521. transformers/models/glm46v/configuration_glm46v.py +0 -1
  522. transformers/models/glm46v/image_processing_glm46v.py +35 -36
  523. transformers/models/glm46v/image_processing_glm46v_fast.py +7 -7
  524. transformers/models/glm46v/modeling_glm46v.py +51 -51
  525. transformers/models/glm46v/modular_glm46v.py +1 -3
  526. transformers/models/glm46v/processing_glm46v.py +7 -41
  527. transformers/models/glm46v/video_processing_glm46v.py +9 -11
  528. transformers/models/glm4_moe/configuration_glm4_moe.py +25 -28
  529. transformers/models/glm4_moe/modeling_glm4_moe.py +32 -35
  530. transformers/models/glm4_moe/modular_glm4_moe.py +26 -29
  531. transformers/models/glm4_moe_lite/__init__.py +28 -0
  532. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +235 -0
  533. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +740 -0
  534. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +304 -0
  535. transformers/models/glm4v/configuration_glm4v.py +14 -17
  536. transformers/models/glm4v/image_processing_glm4v.py +34 -36
  537. transformers/models/glm4v/image_processing_glm4v_fast.py +6 -7
  538. transformers/models/glm4v/modeling_glm4v.py +133 -151
  539. transformers/models/glm4v/modular_glm4v.py +131 -182
  540. transformers/models/glm4v/processing_glm4v.py +7 -41
  541. transformers/models/glm4v/video_processing_glm4v.py +9 -11
  542. transformers/models/glm4v_moe/configuration_glm4v_moe.py +119 -122
  543. transformers/models/glm4v_moe/modeling_glm4v_moe.py +237 -297
  544. transformers/models/glm4v_moe/modular_glm4v_moe.py +54 -163
  545. transformers/models/glm_image/__init__.py +31 -0
  546. transformers/models/glm_image/configuration_glm_image.py +352 -0
  547. transformers/models/glm_image/image_processing_glm_image.py +503 -0
  548. transformers/models/glm_image/image_processing_glm_image_fast.py +296 -0
  549. transformers/models/glm_image/modeling_glm_image.py +1590 -0
  550. transformers/models/glm_image/modular_glm_image.py +1480 -0
  551. transformers/models/glm_image/processing_glm_image.py +217 -0
  552. transformers/models/glmasr/__init__.py +0 -1
  553. transformers/models/glmasr/configuration_glmasr.py +0 -1
  554. transformers/models/glmasr/modeling_glmasr.py +17 -18
  555. transformers/models/glmasr/modular_glmasr.py +16 -18
  556. transformers/models/glmasr/processing_glmasr.py +7 -8
  557. transformers/models/glpn/configuration_glpn.py +0 -1
  558. transformers/models/glpn/image_processing_glpn.py +11 -12
  559. transformers/models/glpn/image_processing_glpn_fast.py +8 -9
  560. transformers/models/glpn/modeling_glpn.py +10 -12
  561. transformers/models/got_ocr2/configuration_got_ocr2.py +5 -8
  562. transformers/models/got_ocr2/image_processing_got_ocr2.py +22 -24
  563. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +6 -7
  564. transformers/models/got_ocr2/modeling_got_ocr2.py +40 -42
  565. transformers/models/got_ocr2/modular_got_ocr2.py +31 -34
  566. transformers/models/got_ocr2/processing_got_ocr2.py +42 -63
  567. transformers/models/gpt2/configuration_gpt2.py +0 -1
  568. transformers/models/gpt2/modeling_gpt2.py +106 -108
  569. transformers/models/gpt2/tokenization_gpt2.py +6 -9
  570. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +0 -1
  571. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +73 -80
  572. transformers/models/gpt_neo/configuration_gpt_neo.py +0 -1
  573. transformers/models/gpt_neo/modeling_gpt_neo.py +63 -64
  574. transformers/models/gpt_neox/configuration_gpt_neox.py +19 -22
  575. transformers/models/gpt_neox/modeling_gpt_neox.py +70 -72
  576. transformers/models/gpt_neox/modular_gpt_neox.py +64 -66
  577. transformers/models/gpt_neox/tokenization_gpt_neox.py +2 -5
  578. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +15 -18
  579. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +41 -44
  580. transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +1 -3
  581. transformers/models/gpt_oss/configuration_gpt_oss.py +21 -24
  582. transformers/models/gpt_oss/modeling_gpt_oss.py +34 -35
  583. transformers/models/gpt_oss/modular_gpt_oss.py +17 -19
  584. transformers/models/gpt_sw3/tokenization_gpt_sw3.py +4 -4
  585. transformers/models/gptj/configuration_gptj.py +0 -1
  586. transformers/models/gptj/modeling_gptj.py +82 -81
  587. transformers/models/granite/configuration_granite.py +23 -26
  588. transformers/models/granite/modeling_granite.py +39 -41
  589. transformers/models/granite/modular_granite.py +29 -31
  590. transformers/models/granite_speech/configuration_granite_speech.py +0 -1
  591. transformers/models/granite_speech/feature_extraction_granite_speech.py +1 -3
  592. transformers/models/granite_speech/modeling_granite_speech.py +21 -23
  593. transformers/models/granite_speech/processing_granite_speech.py +11 -4
  594. transformers/models/granitemoe/configuration_granitemoe.py +26 -29
  595. transformers/models/granitemoe/modeling_granitemoe.py +35 -37
  596. transformers/models/granitemoe/modular_granitemoe.py +21 -23
  597. transformers/models/granitemoehybrid/__init__.py +0 -1
  598. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +38 -41
  599. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +60 -64
  600. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +18 -20
  601. transformers/models/granitemoeshared/configuration_granitemoeshared.py +27 -30
  602. transformers/models/granitemoeshared/modeling_granitemoeshared.py +48 -52
  603. transformers/models/granitemoeshared/modular_granitemoeshared.py +19 -21
  604. transformers/models/grounding_dino/configuration_grounding_dino.py +0 -1
  605. transformers/models/grounding_dino/image_processing_grounding_dino.py +60 -62
  606. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +17 -18
  607. transformers/models/grounding_dino/modeling_grounding_dino.py +94 -96
  608. transformers/models/grounding_dino/modular_grounding_dino.py +2 -3
  609. transformers/models/grounding_dino/processing_grounding_dino.py +10 -38
  610. transformers/models/groupvit/configuration_groupvit.py +0 -1
  611. transformers/models/groupvit/modeling_groupvit.py +69 -70
  612. transformers/models/helium/configuration_helium.py +20 -22
  613. transformers/models/helium/modeling_helium.py +33 -36
  614. transformers/models/helium/modular_helium.py +3 -7
  615. transformers/models/herbert/tokenization_herbert.py +4 -6
  616. transformers/models/hgnet_v2/configuration_hgnet_v2.py +0 -1
  617. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -9
  618. transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -9
  619. transformers/models/hiera/configuration_hiera.py +0 -1
  620. transformers/models/hiera/modeling_hiera.py +60 -62
  621. transformers/models/hubert/configuration_hubert.py +0 -1
  622. transformers/models/hubert/modeling_hubert.py +35 -37
  623. transformers/models/hubert/modular_hubert.py +8 -11
  624. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +21 -24
  625. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +30 -33
  626. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +3 -5
  627. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +25 -28
  628. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +32 -35
  629. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +5 -7
  630. transformers/models/ibert/configuration_ibert.py +0 -1
  631. transformers/models/ibert/modeling_ibert.py +60 -62
  632. transformers/models/ibert/quant_modules.py +0 -1
  633. transformers/models/idefics/configuration_idefics.py +0 -1
  634. transformers/models/idefics/image_processing_idefics.py +13 -15
  635. transformers/models/idefics/modeling_idefics.py +60 -61
  636. transformers/models/idefics/perceiver.py +1 -3
  637. transformers/models/idefics/processing_idefics.py +32 -48
  638. transformers/models/idefics/vision.py +22 -24
  639. transformers/models/idefics2/configuration_idefics2.py +0 -1
  640. transformers/models/idefics2/image_processing_idefics2.py +31 -32
  641. transformers/models/idefics2/image_processing_idefics2_fast.py +7 -8
  642. transformers/models/idefics2/modeling_idefics2.py +56 -58
  643. transformers/models/idefics2/processing_idefics2.py +10 -68
  644. transformers/models/idefics3/configuration_idefics3.py +0 -1
  645. transformers/models/idefics3/image_processing_idefics3.py +42 -43
  646. transformers/models/idefics3/image_processing_idefics3_fast.py +11 -12
  647. transformers/models/idefics3/modeling_idefics3.py +52 -54
  648. transformers/models/idefics3/processing_idefics3.py +15 -69
  649. transformers/models/ijepa/configuration_ijepa.py +0 -1
  650. transformers/models/ijepa/modeling_ijepa.py +10 -11
  651. transformers/models/ijepa/modular_ijepa.py +5 -7
  652. transformers/models/imagegpt/configuration_imagegpt.py +0 -1
  653. transformers/models/imagegpt/image_processing_imagegpt.py +17 -18
  654. transformers/models/imagegpt/image_processing_imagegpt_fast.py +8 -9
  655. transformers/models/imagegpt/modeling_imagegpt.py +57 -58
  656. transformers/models/informer/configuration_informer.py +6 -9
  657. transformers/models/informer/modeling_informer.py +84 -86
  658. transformers/models/informer/modular_informer.py +13 -16
  659. transformers/models/instructblip/configuration_instructblip.py +0 -1
  660. transformers/models/instructblip/modeling_instructblip.py +43 -44
  661. transformers/models/instructblip/processing_instructblip.py +10 -36
  662. transformers/models/instructblipvideo/configuration_instructblipvideo.py +0 -1
  663. transformers/models/instructblipvideo/modeling_instructblipvideo.py +55 -55
  664. transformers/models/instructblipvideo/modular_instructblipvideo.py +34 -36
  665. transformers/models/instructblipvideo/processing_instructblipvideo.py +14 -33
  666. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +4 -5
  667. transformers/models/internvl/configuration_internvl.py +0 -1
  668. transformers/models/internvl/modeling_internvl.py +41 -43
  669. transformers/models/internvl/modular_internvl.py +19 -21
  670. transformers/models/internvl/processing_internvl.py +12 -45
  671. transformers/models/internvl/video_processing_internvl.py +8 -9
  672. transformers/models/jais2/configuration_jais2.py +20 -22
  673. transformers/models/jais2/modeling_jais2.py +32 -34
  674. transformers/models/jais2/modular_jais2.py +20 -22
  675. transformers/models/jamba/configuration_jamba.py +0 -1
  676. transformers/models/jamba/modeling_jamba.py +43 -46
  677. transformers/models/jamba/modular_jamba.py +37 -38
  678. transformers/models/janus/configuration_janus.py +0 -1
  679. transformers/models/janus/image_processing_janus.py +35 -37
  680. transformers/models/janus/image_processing_janus_fast.py +12 -13
  681. transformers/models/janus/modeling_janus.py +41 -43
  682. transformers/models/janus/modular_janus.py +60 -63
  683. transformers/models/janus/processing_janus.py +17 -43
  684. transformers/models/jetmoe/configuration_jetmoe.py +20 -23
  685. transformers/models/jetmoe/modeling_jetmoe.py +39 -42
  686. transformers/models/jetmoe/modular_jetmoe.py +30 -33
  687. transformers/models/kosmos2/configuration_kosmos2.py +0 -1
  688. transformers/models/kosmos2/modeling_kosmos2.py +145 -146
  689. transformers/models/kosmos2/processing_kosmos2.py +40 -55
  690. transformers/models/kosmos2_5/__init__.py +0 -1
  691. transformers/models/kosmos2_5/configuration_kosmos2_5.py +0 -1
  692. transformers/models/kosmos2_5/image_processing_kosmos2_5.py +10 -12
  693. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -11
  694. transformers/models/kosmos2_5/modeling_kosmos2_5.py +108 -109
  695. transformers/models/kosmos2_5/processing_kosmos2_5.py +8 -29
  696. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +23 -25
  697. transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +12 -14
  698. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +59 -66
  699. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +19 -21
  700. transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py +2 -8
  701. transformers/models/lasr/configuration_lasr.py +1 -3
  702. transformers/models/lasr/feature_extraction_lasr.py +10 -12
  703. transformers/models/lasr/modeling_lasr.py +18 -21
  704. transformers/models/lasr/modular_lasr.py +8 -10
  705. transformers/models/lasr/processing_lasr.py +12 -6
  706. transformers/models/lasr/tokenization_lasr.py +2 -4
  707. transformers/models/layoutlm/configuration_layoutlm.py +0 -1
  708. transformers/models/layoutlm/modeling_layoutlm.py +67 -69
  709. transformers/models/layoutlmv2/configuration_layoutlmv2.py +0 -1
  710. transformers/models/layoutlmv2/image_processing_layoutlmv2.py +18 -21
  711. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +5 -6
  712. transformers/models/layoutlmv2/modeling_layoutlmv2.py +48 -50
  713. transformers/models/layoutlmv2/processing_layoutlmv2.py +14 -44
  714. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +63 -74
  715. transformers/models/layoutlmv3/configuration_layoutlmv3.py +0 -1
  716. transformers/models/layoutlmv3/image_processing_layoutlmv3.py +24 -26
  717. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +7 -8
  718. transformers/models/layoutlmv3/modeling_layoutlmv3.py +49 -51
  719. transformers/models/layoutlmv3/processing_layoutlmv3.py +14 -46
  720. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +64 -75
  721. transformers/models/layoutxlm/configuration_layoutxlm.py +0 -1
  722. transformers/models/layoutxlm/modular_layoutxlm.py +0 -1
  723. transformers/models/layoutxlm/processing_layoutxlm.py +14 -44
  724. transformers/models/layoutxlm/tokenization_layoutxlm.py +65 -76
  725. transformers/models/led/configuration_led.py +1 -4
  726. transformers/models/led/modeling_led.py +113 -267
  727. transformers/models/levit/configuration_levit.py +0 -1
  728. transformers/models/levit/image_processing_levit.py +19 -21
  729. transformers/models/levit/image_processing_levit_fast.py +0 -1
  730. transformers/models/levit/modeling_levit.py +17 -19
  731. transformers/models/lfm2/configuration_lfm2.py +22 -23
  732. transformers/models/lfm2/modeling_lfm2.py +42 -44
  733. transformers/models/lfm2/modular_lfm2.py +29 -29
  734. transformers/models/lfm2_moe/__init__.py +0 -1
  735. transformers/models/lfm2_moe/configuration_lfm2_moe.py +1 -2
  736. transformers/models/lfm2_moe/modeling_lfm2_moe.py +44 -45
  737. transformers/models/lfm2_moe/modular_lfm2_moe.py +8 -9
  738. transformers/models/lfm2_vl/configuration_lfm2_vl.py +0 -1
  739. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +34 -5
  740. transformers/models/lfm2_vl/modeling_lfm2_vl.py +31 -33
  741. transformers/models/lfm2_vl/modular_lfm2_vl.py +24 -27
  742. transformers/models/lfm2_vl/processing_lfm2_vl.py +14 -34
  743. transformers/models/lightglue/image_processing_lightglue.py +16 -15
  744. transformers/models/lightglue/image_processing_lightglue_fast.py +4 -4
  745. transformers/models/lightglue/modeling_lightglue.py +28 -30
  746. transformers/models/lightglue/modular_lightglue.py +28 -28
  747. transformers/models/lighton_ocr/__init__.py +28 -0
  748. transformers/models/lighton_ocr/configuration_lighton_ocr.py +128 -0
  749. transformers/models/lighton_ocr/modeling_lighton_ocr.py +460 -0
  750. transformers/models/lighton_ocr/modular_lighton_ocr.py +403 -0
  751. transformers/models/lighton_ocr/processing_lighton_ocr.py +229 -0
  752. transformers/models/lilt/configuration_lilt.py +0 -1
  753. transformers/models/lilt/modeling_lilt.py +53 -55
  754. transformers/models/llama/configuration_llama.py +21 -24
  755. transformers/models/llama/modeling_llama.py +31 -34
  756. transformers/models/llama/tokenization_llama.py +2 -4
  757. transformers/models/llama4/configuration_llama4.py +20 -22
  758. transformers/models/llama4/image_processing_llama4_fast.py +8 -9
  759. transformers/models/llama4/modeling_llama4.py +70 -71
  760. transformers/models/llama4/processing_llama4.py +33 -57
  761. transformers/models/llava/configuration_llava.py +0 -1
  762. transformers/models/llava/image_processing_llava.py +25 -28
  763. transformers/models/llava/image_processing_llava_fast.py +6 -7
  764. transformers/models/llava/modeling_llava.py +35 -37
  765. transformers/models/llava/processing_llava.py +18 -51
  766. transformers/models/llava_next/configuration_llava_next.py +0 -1
  767. transformers/models/llava_next/image_processing_llava_next.py +43 -45
  768. transformers/models/llava_next/image_processing_llava_next_fast.py +5 -6
  769. transformers/models/llava_next/modeling_llava_next.py +42 -44
  770. transformers/models/llava_next/processing_llava_next.py +18 -47
  771. transformers/models/llava_next_video/configuration_llava_next_video.py +0 -1
  772. transformers/models/llava_next_video/modeling_llava_next_video.py +53 -55
  773. transformers/models/llava_next_video/modular_llava_next_video.py +44 -46
  774. transformers/models/llava_next_video/processing_llava_next_video.py +21 -63
  775. transformers/models/llava_next_video/video_processing_llava_next_video.py +0 -1
  776. transformers/models/llava_onevision/configuration_llava_onevision.py +0 -1
  777. transformers/models/llava_onevision/image_processing_llava_onevision.py +40 -42
  778. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +6 -7
  779. transformers/models/llava_onevision/modeling_llava_onevision.py +60 -62
  780. transformers/models/llava_onevision/modular_llava_onevision.py +51 -52
  781. transformers/models/llava_onevision/processing_llava_onevision.py +21 -53
  782. transformers/models/llava_onevision/video_processing_llava_onevision.py +0 -1
  783. transformers/models/longcat_flash/__init__.py +0 -1
  784. transformers/models/longcat_flash/configuration_longcat_flash.py +32 -35
  785. transformers/models/longcat_flash/modeling_longcat_flash.py +30 -31
  786. transformers/models/longcat_flash/modular_longcat_flash.py +17 -19
  787. transformers/models/longformer/configuration_longformer.py +1 -4
  788. transformers/models/longformer/modeling_longformer.py +99 -101
  789. transformers/models/longt5/configuration_longt5.py +0 -1
  790. transformers/models/longt5/modeling_longt5.py +43 -44
  791. transformers/models/luke/configuration_luke.py +0 -1
  792. transformers/models/luke/modeling_luke.py +179 -181
  793. transformers/models/luke/tokenization_luke.py +99 -105
  794. transformers/models/lw_detr/__init__.py +27 -0
  795. transformers/models/lw_detr/configuration_lw_detr.py +374 -0
  796. transformers/models/lw_detr/modeling_lw_detr.py +1698 -0
  797. transformers/models/lw_detr/modular_lw_detr.py +1611 -0
  798. transformers/models/lxmert/configuration_lxmert.py +0 -1
  799. transformers/models/lxmert/modeling_lxmert.py +63 -74
  800. transformers/models/m2m_100/configuration_m2m_100.py +0 -1
  801. transformers/models/m2m_100/modeling_m2m_100.py +69 -71
  802. transformers/models/m2m_100/tokenization_m2m_100.py +8 -8
  803. transformers/models/mamba/configuration_mamba.py +0 -1
  804. transformers/models/mamba/modeling_mamba.py +43 -44
  805. transformers/models/mamba2/configuration_mamba2.py +0 -1
  806. transformers/models/mamba2/modeling_mamba2.py +44 -46
  807. transformers/models/marian/configuration_marian.py +0 -1
  808. transformers/models/marian/modeling_marian.py +84 -86
  809. transformers/models/marian/tokenization_marian.py +6 -6
  810. transformers/models/markuplm/configuration_markuplm.py +0 -1
  811. transformers/models/markuplm/feature_extraction_markuplm.py +1 -2
  812. transformers/models/markuplm/modeling_markuplm.py +60 -62
  813. transformers/models/markuplm/processing_markuplm.py +31 -38
  814. transformers/models/markuplm/tokenization_markuplm.py +67 -77
  815. transformers/models/mask2former/configuration_mask2former.py +4 -7
  816. transformers/models/mask2former/image_processing_mask2former.py +84 -85
  817. transformers/models/mask2former/image_processing_mask2former_fast.py +29 -29
  818. transformers/models/mask2former/modeling_mask2former.py +90 -92
  819. transformers/models/mask2former/modular_mask2former.py +6 -8
  820. transformers/models/maskformer/configuration_maskformer.py +5 -8
  821. transformers/models/maskformer/configuration_maskformer_swin.py +0 -1
  822. transformers/models/maskformer/image_processing_maskformer.py +84 -85
  823. transformers/models/maskformer/image_processing_maskformer_fast.py +28 -29
  824. transformers/models/maskformer/modeling_maskformer.py +56 -58
  825. transformers/models/maskformer/modeling_maskformer_swin.py +18 -20
  826. transformers/models/mbart/configuration_mbart.py +0 -1
  827. transformers/models/mbart/modeling_mbart.py +111 -113
  828. transformers/models/mbart/tokenization_mbart.py +2 -4
  829. transformers/models/mbart50/tokenization_mbart50.py +3 -5
  830. transformers/models/megatron_bert/configuration_megatron_bert.py +0 -1
  831. transformers/models/megatron_bert/modeling_megatron_bert.py +139 -150
  832. transformers/models/metaclip_2/modeling_metaclip_2.py +46 -46
  833. transformers/models/metaclip_2/modular_metaclip_2.py +19 -21
  834. transformers/models/mgp_str/configuration_mgp_str.py +0 -1
  835. transformers/models/mgp_str/modeling_mgp_str.py +14 -16
  836. transformers/models/mgp_str/processing_mgp_str.py +3 -20
  837. transformers/models/mgp_str/tokenization_mgp_str.py +1 -3
  838. transformers/models/mimi/configuration_mimi.py +38 -40
  839. transformers/models/mimi/modeling_mimi.py +76 -79
  840. transformers/models/minimax/__init__.py +0 -1
  841. transformers/models/minimax/configuration_minimax.py +32 -36
  842. transformers/models/minimax/modeling_minimax.py +41 -44
  843. transformers/models/minimax/modular_minimax.py +50 -53
  844. transformers/models/minimax_m2/__init__.py +28 -0
  845. transformers/models/minimax_m2/configuration_minimax_m2.py +211 -0
  846. transformers/models/minimax_m2/modeling_minimax_m2.py +704 -0
  847. transformers/models/minimax_m2/modular_minimax_m2.py +369 -0
  848. transformers/models/ministral/configuration_ministral.py +20 -22
  849. transformers/models/ministral/modeling_ministral.py +31 -33
  850. transformers/models/ministral/modular_ministral.py +27 -29
  851. transformers/models/ministral3/configuration_ministral3.py +19 -22
  852. transformers/models/ministral3/modeling_ministral3.py +31 -33
  853. transformers/models/ministral3/modular_ministral3.py +4 -5
  854. transformers/models/mistral/configuration_mistral.py +19 -22
  855. transformers/models/mistral/modeling_mistral.py +31 -33
  856. transformers/models/mistral/modular_mistral.py +11 -12
  857. transformers/models/mistral3/configuration_mistral3.py +0 -1
  858. transformers/models/mistral3/modeling_mistral3.py +43 -42
  859. transformers/models/mistral3/modular_mistral3.py +35 -35
  860. transformers/models/mixtral/configuration_mixtral.py +24 -27
  861. transformers/models/mixtral/modeling_mixtral.py +35 -38
  862. transformers/models/mixtral/modular_mixtral.py +26 -29
  863. transformers/models/mlcd/configuration_mlcd.py +0 -1
  864. transformers/models/mlcd/modeling_mlcd.py +10 -12
  865. transformers/models/mlcd/modular_mlcd.py +9 -11
  866. transformers/models/mllama/configuration_mllama.py +5 -8
  867. transformers/models/mllama/image_processing_mllama.py +23 -25
  868. transformers/models/mllama/image_processing_mllama_fast.py +5 -6
  869. transformers/models/mllama/modeling_mllama.py +81 -84
  870. transformers/models/mllama/processing_mllama.py +6 -55
  871. transformers/models/mluke/tokenization_mluke.py +97 -103
  872. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +0 -1
  873. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +94 -96
  874. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +0 -1
  875. transformers/models/mobilebert/configuration_mobilebert.py +0 -1
  876. transformers/models/mobilebert/modeling_mobilebert.py +75 -85
  877. transformers/models/mobilebert/tokenization_mobilebert.py +0 -1
  878. transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +0 -1
  879. transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +20 -23
  880. transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +0 -1
  881. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +13 -16
  882. transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +0 -1
  883. transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +48 -51
  884. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +10 -11
  885. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +17 -20
  886. transformers/models/mobilevit/configuration_mobilevit.py +0 -1
  887. transformers/models/mobilevit/image_processing_mobilevit.py +41 -44
  888. transformers/models/mobilevit/image_processing_mobilevit_fast.py +8 -9
  889. transformers/models/mobilevit/modeling_mobilevit.py +17 -19
  890. transformers/models/mobilevitv2/configuration_mobilevitv2.py +0 -1
  891. transformers/models/mobilevitv2/modeling_mobilevitv2.py +17 -20
  892. transformers/models/modernbert/configuration_modernbert.py +34 -34
  893. transformers/models/modernbert/modeling_modernbert.py +123 -125
  894. transformers/models/modernbert/modular_modernbert.py +155 -155
  895. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +30 -32
  896. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +45 -47
  897. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +69 -70
  898. transformers/models/moonshine/configuration_moonshine.py +22 -24
  899. transformers/models/moonshine/modeling_moonshine.py +63 -65
  900. transformers/models/moonshine/modular_moonshine.py +72 -73
  901. transformers/models/moshi/configuration_moshi.py +18 -21
  902. transformers/models/moshi/modeling_moshi.py +130 -133
  903. transformers/models/mpnet/configuration_mpnet.py +0 -1
  904. transformers/models/mpnet/modeling_mpnet.py +55 -57
  905. transformers/models/mpnet/tokenization_mpnet.py +1 -4
  906. transformers/models/mpt/configuration_mpt.py +1 -9
  907. transformers/models/mpt/modeling_mpt.py +58 -60
  908. transformers/models/mra/configuration_mra.py +0 -1
  909. transformers/models/mra/modeling_mra.py +54 -56
  910. transformers/models/mt5/configuration_mt5.py +0 -1
  911. transformers/models/mt5/modeling_mt5.py +75 -77
  912. transformers/models/musicgen/configuration_musicgen.py +0 -1
  913. transformers/models/musicgen/modeling_musicgen.py +108 -111
  914. transformers/models/musicgen/processing_musicgen.py +3 -21
  915. transformers/models/musicgen_melody/configuration_musicgen_melody.py +0 -1
  916. transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +8 -9
  917. transformers/models/musicgen_melody/modeling_musicgen_melody.py +106 -109
  918. transformers/models/musicgen_melody/processing_musicgen_melody.py +3 -22
  919. transformers/models/mvp/configuration_mvp.py +0 -1
  920. transformers/models/mvp/modeling_mvp.py +115 -119
  921. transformers/models/myt5/tokenization_myt5.py +8 -10
  922. transformers/models/nanochat/configuration_nanochat.py +0 -1
  923. transformers/models/nanochat/modeling_nanochat.py +32 -35
  924. transformers/models/nanochat/modular_nanochat.py +12 -14
  925. transformers/models/nemotron/configuration_nemotron.py +20 -23
  926. transformers/models/nemotron/modeling_nemotron.py +49 -52
  927. transformers/models/nllb/tokenization_nllb.py +7 -9
  928. transformers/models/nllb_moe/configuration_nllb_moe.py +0 -1
  929. transformers/models/nllb_moe/modeling_nllb_moe.py +67 -69
  930. transformers/models/nougat/image_processing_nougat.py +29 -32
  931. transformers/models/nougat/image_processing_nougat_fast.py +4 -5
  932. transformers/models/nougat/processing_nougat.py +37 -39
  933. transformers/models/nougat/tokenization_nougat.py +5 -7
  934. transformers/models/nystromformer/configuration_nystromformer.py +0 -1
  935. transformers/models/nystromformer/modeling_nystromformer.py +61 -63
  936. transformers/models/olmo/configuration_olmo.py +18 -21
  937. transformers/models/olmo/modeling_olmo.py +31 -34
  938. transformers/models/olmo/modular_olmo.py +5 -9
  939. transformers/models/olmo2/configuration_olmo2.py +18 -21
  940. transformers/models/olmo2/modeling_olmo2.py +32 -35
  941. transformers/models/olmo2/modular_olmo2.py +29 -31
  942. transformers/models/olmo3/__init__.py +0 -1
  943. transformers/models/olmo3/configuration_olmo3.py +20 -23
  944. transformers/models/olmo3/modeling_olmo3.py +31 -34
  945. transformers/models/olmo3/modular_olmo3.py +31 -33
  946. transformers/models/olmoe/configuration_olmoe.py +24 -26
  947. transformers/models/olmoe/modeling_olmoe.py +37 -39
  948. transformers/models/olmoe/modular_olmoe.py +12 -13
  949. transformers/models/omdet_turbo/configuration_omdet_turbo.py +0 -1
  950. transformers/models/omdet_turbo/modeling_omdet_turbo.py +38 -40
  951. transformers/models/omdet_turbo/processing_omdet_turbo.py +19 -67
  952. transformers/models/oneformer/configuration_oneformer.py +4 -7
  953. transformers/models/oneformer/image_processing_oneformer.py +83 -84
  954. transformers/models/oneformer/image_processing_oneformer_fast.py +33 -34
  955. transformers/models/oneformer/modeling_oneformer.py +123 -124
  956. transformers/models/oneformer/processing_oneformer.py +28 -43
  957. transformers/models/openai/configuration_openai.py +0 -1
  958. transformers/models/openai/modeling_openai.py +50 -51
  959. transformers/models/openai/tokenization_openai.py +2 -5
  960. transformers/models/opt/configuration_opt.py +0 -1
  961. transformers/models/opt/modeling_opt.py +74 -75
  962. transformers/models/ovis2/__init__.py +0 -1
  963. transformers/models/ovis2/configuration_ovis2.py +0 -1
  964. transformers/models/ovis2/image_processing_ovis2.py +22 -24
  965. transformers/models/ovis2/image_processing_ovis2_fast.py +6 -7
  966. transformers/models/ovis2/modeling_ovis2.py +43 -45
  967. transformers/models/ovis2/modular_ovis2.py +30 -32
  968. transformers/models/ovis2/processing_ovis2.py +12 -40
  969. transformers/models/owlv2/configuration_owlv2.py +0 -1
  970. transformers/models/owlv2/image_processing_owlv2.py +20 -21
  971. transformers/models/owlv2/image_processing_owlv2_fast.py +7 -8
  972. transformers/models/owlv2/modeling_owlv2.py +82 -87
  973. transformers/models/owlv2/modular_owlv2.py +6 -7
  974. transformers/models/owlv2/processing_owlv2.py +20 -49
  975. transformers/models/owlvit/configuration_owlvit.py +0 -1
  976. transformers/models/owlvit/image_processing_owlvit.py +21 -22
  977. transformers/models/owlvit/image_processing_owlvit_fast.py +2 -3
  978. transformers/models/owlvit/modeling_owlvit.py +81 -86
  979. transformers/models/owlvit/processing_owlvit.py +20 -48
  980. transformers/models/paddleocr_vl/__init__.py +0 -1
  981. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +19 -19
  982. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +34 -35
  983. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +12 -12
  984. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +76 -76
  985. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +68 -68
  986. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +1 -3
  987. transformers/models/paligemma/configuration_paligemma.py +0 -1
  988. transformers/models/paligemma/modeling_paligemma.py +51 -53
  989. transformers/models/paligemma/processing_paligemma.py +13 -66
  990. transformers/models/parakeet/configuration_parakeet.py +1 -4
  991. transformers/models/parakeet/feature_extraction_parakeet.py +10 -12
  992. transformers/models/parakeet/modeling_parakeet.py +18 -22
  993. transformers/models/parakeet/modular_parakeet.py +16 -18
  994. transformers/models/parakeet/processing_parakeet.py +12 -5
  995. transformers/models/parakeet/tokenization_parakeet.py +2 -4
  996. transformers/models/patchtsmixer/configuration_patchtsmixer.py +5 -8
  997. transformers/models/patchtsmixer/modeling_patchtsmixer.py +60 -62
  998. transformers/models/patchtst/configuration_patchtst.py +6 -9
  999. transformers/models/patchtst/modeling_patchtst.py +72 -74
  1000. transformers/models/pe_audio/__init__.py +0 -1
  1001. transformers/models/pe_audio/configuration_pe_audio.py +14 -16
  1002. transformers/models/pe_audio/feature_extraction_pe_audio.py +6 -8
  1003. transformers/models/pe_audio/modeling_pe_audio.py +26 -27
  1004. transformers/models/pe_audio/modular_pe_audio.py +16 -17
  1005. transformers/models/pe_audio/processing_pe_audio.py +0 -1
  1006. transformers/models/pe_audio_video/__init__.py +0 -1
  1007. transformers/models/pe_audio_video/configuration_pe_audio_video.py +15 -17
  1008. transformers/models/pe_audio_video/modeling_pe_audio_video.py +60 -61
  1009. transformers/models/pe_audio_video/modular_pe_audio_video.py +52 -53
  1010. transformers/models/pe_audio_video/processing_pe_audio_video.py +0 -1
  1011. transformers/models/pe_video/__init__.py +0 -1
  1012. transformers/models/pe_video/configuration_pe_video.py +14 -16
  1013. transformers/models/pe_video/modeling_pe_video.py +21 -22
  1014. transformers/models/pe_video/modular_pe_video.py +11 -12
  1015. transformers/models/pe_video/video_processing_pe_video.py +2 -4
  1016. transformers/models/pegasus/configuration_pegasus.py +0 -1
  1017. transformers/models/pegasus/modeling_pegasus.py +63 -65
  1018. transformers/models/pegasus/tokenization_pegasus.py +1 -4
  1019. transformers/models/pegasus_x/configuration_pegasus_x.py +0 -1
  1020. transformers/models/pegasus_x/modeling_pegasus_x.py +50 -52
  1021. transformers/models/perceiver/configuration_perceiver.py +0 -1
  1022. transformers/models/perceiver/image_processing_perceiver.py +22 -25
  1023. transformers/models/perceiver/image_processing_perceiver_fast.py +5 -6
  1024. transformers/models/perceiver/modeling_perceiver.py +135 -136
  1025. transformers/models/perceiver/tokenization_perceiver.py +3 -6
  1026. transformers/models/perception_lm/configuration_perception_lm.py +0 -1
  1027. transformers/models/perception_lm/image_processing_perception_lm_fast.py +8 -9
  1028. transformers/models/perception_lm/modeling_perception_lm.py +38 -40
  1029. transformers/models/perception_lm/modular_perception_lm.py +31 -33
  1030. transformers/models/perception_lm/processing_perception_lm.py +13 -47
  1031. transformers/models/perception_lm/video_processing_perception_lm.py +0 -1
  1032. transformers/models/persimmon/configuration_persimmon.py +18 -21
  1033. transformers/models/persimmon/modeling_persimmon.py +39 -42
  1034. transformers/models/phi/configuration_phi.py +19 -22
  1035. transformers/models/phi/modeling_phi.py +35 -37
  1036. transformers/models/phi/modular_phi.py +23 -23
  1037. transformers/models/phi3/configuration_phi3.py +23 -26
  1038. transformers/models/phi3/modeling_phi3.py +33 -36
  1039. transformers/models/phi3/modular_phi3.py +13 -17
  1040. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +25 -26
  1041. transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +7 -9
  1042. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +7 -7
  1043. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +54 -56
  1044. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +59 -60
  1045. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +7 -42
  1046. transformers/models/phimoe/configuration_phimoe.py +26 -29
  1047. transformers/models/phimoe/modeling_phimoe.py +35 -38
  1048. transformers/models/phimoe/modular_phimoe.py +0 -1
  1049. transformers/models/phobert/tokenization_phobert.py +4 -6
  1050. transformers/models/pix2struct/configuration_pix2struct.py +0 -1
  1051. transformers/models/pix2struct/image_processing_pix2struct.py +15 -19
  1052. transformers/models/pix2struct/image_processing_pix2struct_fast.py +7 -10
  1053. transformers/models/pix2struct/modeling_pix2struct.py +42 -45
  1054. transformers/models/pix2struct/processing_pix2struct.py +5 -26
  1055. transformers/models/pixio/__init__.py +0 -1
  1056. transformers/models/pixio/configuration_pixio.py +0 -1
  1057. transformers/models/pixio/modeling_pixio.py +7 -9
  1058. transformers/models/pixio/modular_pixio.py +3 -6
  1059. transformers/models/pixtral/configuration_pixtral.py +11 -14
  1060. transformers/models/pixtral/image_processing_pixtral.py +26 -28
  1061. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -6
  1062. transformers/models/pixtral/modeling_pixtral.py +22 -25
  1063. transformers/models/pixtral/processing_pixtral.py +18 -52
  1064. transformers/models/plbart/configuration_plbart.py +0 -1
  1065. transformers/models/plbart/modeling_plbart.py +100 -102
  1066. transformers/models/plbart/modular_plbart.py +30 -32
  1067. transformers/models/plbart/tokenization_plbart.py +4 -5
  1068. transformers/models/poolformer/configuration_poolformer.py +0 -1
  1069. transformers/models/poolformer/image_processing_poolformer.py +21 -24
  1070. transformers/models/poolformer/image_processing_poolformer_fast.py +6 -7
  1071. transformers/models/poolformer/modeling_poolformer.py +10 -12
  1072. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  1073. transformers/models/pop2piano/feature_extraction_pop2piano.py +6 -9
  1074. transformers/models/pop2piano/modeling_pop2piano.py +22 -23
  1075. transformers/models/pop2piano/processing_pop2piano.py +25 -33
  1076. transformers/models/pop2piano/tokenization_pop2piano.py +15 -23
  1077. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +1 -0
  1078. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +28 -28
  1079. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +14 -15
  1080. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +9 -10
  1081. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +9 -10
  1082. transformers/models/prophetnet/configuration_prophetnet.py +26 -28
  1083. transformers/models/prophetnet/modeling_prophetnet.py +109 -130
  1084. transformers/models/prophetnet/tokenization_prophetnet.py +14 -16
  1085. transformers/models/pvt/configuration_pvt.py +0 -1
  1086. transformers/models/pvt/image_processing_pvt.py +17 -20
  1087. transformers/models/pvt/image_processing_pvt_fast.py +0 -1
  1088. transformers/models/pvt/modeling_pvt.py +19 -21
  1089. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  1090. transformers/models/pvt_v2/modeling_pvt_v2.py +21 -23
  1091. transformers/models/qwen2/configuration_qwen2.py +18 -21
  1092. transformers/models/qwen2/modeling_qwen2.py +31 -33
  1093. transformers/models/qwen2/modular_qwen2.py +11 -12
  1094. transformers/models/qwen2/tokenization_qwen2.py +2 -5
  1095. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +20 -23
  1096. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +135 -128
  1097. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +116 -109
  1098. transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +41 -49
  1099. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +22 -25
  1100. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +94 -96
  1101. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +46 -85
  1102. transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +7 -43
  1103. transformers/models/qwen2_audio/configuration_qwen2_audio.py +0 -1
  1104. transformers/models/qwen2_audio/modeling_qwen2_audio.py +27 -29
  1105. transformers/models/qwen2_audio/processing_qwen2_audio.py +13 -42
  1106. transformers/models/qwen2_moe/configuration_qwen2_moe.py +28 -31
  1107. transformers/models/qwen2_moe/modeling_qwen2_moe.py +36 -39
  1108. transformers/models/qwen2_moe/modular_qwen2_moe.py +7 -10
  1109. transformers/models/qwen2_vl/configuration_qwen2_vl.py +22 -24
  1110. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +38 -40
  1111. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +8 -9
  1112. transformers/models/qwen2_vl/modeling_qwen2_vl.py +91 -92
  1113. transformers/models/qwen2_vl/processing_qwen2_vl.py +7 -44
  1114. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +35 -13
  1115. transformers/models/qwen3/configuration_qwen3.py +20 -23
  1116. transformers/models/qwen3/modeling_qwen3.py +31 -34
  1117. transformers/models/qwen3/modular_qwen3.py +4 -6
  1118. transformers/models/qwen3_moe/configuration_qwen3_moe.py +25 -28
  1119. transformers/models/qwen3_moe/modeling_qwen3_moe.py +36 -39
  1120. transformers/models/qwen3_moe/modular_qwen3_moe.py +10 -13
  1121. transformers/models/qwen3_next/configuration_qwen3_next.py +31 -34
  1122. transformers/models/qwen3_next/modeling_qwen3_next.py +39 -42
  1123. transformers/models/qwen3_next/modular_qwen3_next.py +33 -34
  1124. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +85 -88
  1125. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +107 -110
  1126. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +122 -148
  1127. transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +40 -48
  1128. transformers/models/qwen3_vl/configuration_qwen3_vl.py +16 -19
  1129. transformers/models/qwen3_vl/modeling_qwen3_vl.py +74 -77
  1130. transformers/models/qwen3_vl/modular_qwen3_vl.py +68 -105
  1131. transformers/models/qwen3_vl/processing_qwen3_vl.py +6 -42
  1132. transformers/models/qwen3_vl/video_processing_qwen3_vl.py +10 -12
  1133. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +21 -25
  1134. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +80 -83
  1135. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +33 -36
  1136. transformers/models/rag/configuration_rag.py +0 -1
  1137. transformers/models/rag/modeling_rag.py +116 -118
  1138. transformers/models/rag/retrieval_rag.py +2 -4
  1139. transformers/models/rag/tokenization_rag.py +0 -50
  1140. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +21 -24
  1141. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +31 -34
  1142. transformers/models/reformer/configuration_reformer.py +0 -1
  1143. transformers/models/reformer/modeling_reformer.py +67 -68
  1144. transformers/models/reformer/tokenization_reformer.py +3 -6
  1145. transformers/models/regnet/configuration_regnet.py +0 -1
  1146. transformers/models/regnet/modeling_regnet.py +7 -9
  1147. transformers/models/rembert/configuration_rembert.py +0 -1
  1148. transformers/models/rembert/modeling_rembert.py +108 -110
  1149. transformers/models/rembert/tokenization_rembert.py +1 -4
  1150. transformers/models/resnet/configuration_resnet.py +0 -1
  1151. transformers/models/resnet/modeling_resnet.py +8 -10
  1152. transformers/models/roberta/configuration_roberta.py +0 -1
  1153. transformers/models/roberta/modeling_roberta.py +91 -93
  1154. transformers/models/roberta/modular_roberta.py +55 -58
  1155. transformers/models/roberta/tokenization_roberta.py +2 -5
  1156. transformers/models/roberta/tokenization_roberta_old.py +2 -4
  1157. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +0 -1
  1158. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +91 -93
  1159. transformers/models/roc_bert/configuration_roc_bert.py +0 -1
  1160. transformers/models/roc_bert/modeling_roc_bert.py +119 -121
  1161. transformers/models/roc_bert/tokenization_roc_bert.py +88 -94
  1162. transformers/models/roformer/configuration_roformer.py +0 -1
  1163. transformers/models/roformer/modeling_roformer.py +79 -81
  1164. transformers/models/roformer/tokenization_roformer.py +3 -6
  1165. transformers/models/roformer/tokenization_utils.py +0 -1
  1166. transformers/models/rt_detr/configuration_rt_detr.py +0 -1
  1167. transformers/models/rt_detr/configuration_rt_detr_resnet.py +0 -1
  1168. transformers/models/rt_detr/image_processing_rt_detr.py +54 -55
  1169. transformers/models/rt_detr/image_processing_rt_detr_fast.py +15 -15
  1170. transformers/models/rt_detr/modeling_rt_detr.py +80 -82
  1171. transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -4
  1172. transformers/models/rt_detr/modular_rt_detr.py +14 -14
  1173. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +0 -1
  1174. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +79 -81
  1175. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +2 -4
  1176. transformers/models/rwkv/configuration_rwkv.py +0 -1
  1177. transformers/models/rwkv/modeling_rwkv.py +29 -31
  1178. transformers/models/sam/configuration_sam.py +0 -1
  1179. transformers/models/sam/image_processing_sam.py +59 -60
  1180. transformers/models/sam/image_processing_sam_fast.py +21 -22
  1181. transformers/models/sam/modeling_sam.py +33 -35
  1182. transformers/models/sam/processing_sam.py +39 -27
  1183. transformers/models/sam2/configuration_sam2.py +0 -1
  1184. transformers/models/sam2/image_processing_sam2_fast.py +14 -15
  1185. transformers/models/sam2/modeling_sam2.py +45 -47
  1186. transformers/models/sam2/modular_sam2.py +43 -44
  1187. transformers/models/sam2/processing_sam2.py +31 -47
  1188. transformers/models/sam2_video/configuration_sam2_video.py +0 -1
  1189. transformers/models/sam2_video/modeling_sam2_video.py +69 -70
  1190. transformers/models/sam2_video/modular_sam2_video.py +60 -79
  1191. transformers/models/sam2_video/processing_sam2_video.py +49 -66
  1192. transformers/models/sam2_video/video_processing_sam2_video.py +1 -4
  1193. transformers/models/sam3/configuration_sam3.py +0 -1
  1194. transformers/models/sam3/image_processing_sam3_fast.py +17 -20
  1195. transformers/models/sam3/modeling_sam3.py +54 -56
  1196. transformers/models/sam3/modular_sam3.py +3 -8
  1197. transformers/models/sam3/processing_sam3.py +29 -48
  1198. transformers/models/sam3_tracker/__init__.py +0 -1
  1199. transformers/models/sam3_tracker/configuration_sam3_tracker.py +0 -1
  1200. transformers/models/sam3_tracker/modeling_sam3_tracker.py +34 -36
  1201. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
  1202. transformers/models/sam3_tracker/processing_sam3_tracker.py +31 -47
  1203. transformers/models/sam3_tracker_video/__init__.py +0 -1
  1204. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +0 -1
  1205. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +70 -70
  1206. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +2 -4
  1207. transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +50 -66
  1208. transformers/models/sam3_video/configuration_sam3_video.py +0 -1
  1209. transformers/models/sam3_video/modeling_sam3_video.py +29 -31
  1210. transformers/models/sam3_video/processing_sam3_video.py +25 -45
  1211. transformers/models/sam_hq/__init__.py +1 -1
  1212. transformers/models/sam_hq/configuration_sam_hq.py +0 -1
  1213. transformers/models/sam_hq/modeling_sam_hq.py +39 -41
  1214. transformers/models/sam_hq/modular_sam_hq.py +17 -19
  1215. transformers/models/sam_hq/{processing_samhq.py → processing_sam_hq.py} +39 -28
  1216. transformers/models/seamless_m4t/configuration_seamless_m4t.py +0 -1
  1217. transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py +8 -11
  1218. transformers/models/seamless_m4t/modeling_seamless_m4t.py +180 -182
  1219. transformers/models/seamless_m4t/processing_seamless_m4t.py +18 -39
  1220. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +15 -20
  1221. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +0 -1
  1222. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +193 -195
  1223. transformers/models/seed_oss/configuration_seed_oss.py +23 -25
  1224. transformers/models/seed_oss/modeling_seed_oss.py +30 -32
  1225. transformers/models/seed_oss/modular_seed_oss.py +3 -4
  1226. transformers/models/segformer/configuration_segformer.py +0 -10
  1227. transformers/models/segformer/image_processing_segformer.py +39 -42
  1228. transformers/models/segformer/image_processing_segformer_fast.py +7 -8
  1229. transformers/models/segformer/modeling_segformer.py +24 -26
  1230. transformers/models/segformer/modular_segformer.py +5 -6
  1231. transformers/models/seggpt/configuration_seggpt.py +0 -1
  1232. transformers/models/seggpt/image_processing_seggpt.py +38 -41
  1233. transformers/models/seggpt/modeling_seggpt.py +28 -30
  1234. transformers/models/sew/configuration_sew.py +0 -1
  1235. transformers/models/sew/modeling_sew.py +33 -35
  1236. transformers/models/sew/modular_sew.py +10 -12
  1237. transformers/models/sew_d/configuration_sew_d.py +0 -1
  1238. transformers/models/sew_d/modeling_sew_d.py +28 -30
  1239. transformers/models/shieldgemma2/configuration_shieldgemma2.py +0 -1
  1240. transformers/models/shieldgemma2/modeling_shieldgemma2.py +15 -17
  1241. transformers/models/shieldgemma2/processing_shieldgemma2.py +3 -5
  1242. transformers/models/siglip/configuration_siglip.py +0 -1
  1243. transformers/models/siglip/image_processing_siglip.py +17 -20
  1244. transformers/models/siglip/image_processing_siglip_fast.py +0 -1
  1245. transformers/models/siglip/modeling_siglip.py +38 -39
  1246. transformers/models/siglip/processing_siglip.py +2 -14
  1247. transformers/models/siglip/tokenization_siglip.py +6 -7
  1248. transformers/models/siglip2/configuration_siglip2.py +1 -1
  1249. transformers/models/siglip2/image_processing_siglip2.py +15 -16
  1250. transformers/models/siglip2/image_processing_siglip2_fast.py +4 -5
  1251. transformers/models/siglip2/modeling_siglip2.py +54 -54
  1252. transformers/models/siglip2/modular_siglip2.py +23 -25
  1253. transformers/models/siglip2/processing_siglip2.py +2 -14
  1254. transformers/models/smollm3/configuration_smollm3.py +23 -26
  1255. transformers/models/smollm3/modeling_smollm3.py +31 -34
  1256. transformers/models/smollm3/modular_smollm3.py +27 -29
  1257. transformers/models/smolvlm/configuration_smolvlm.py +1 -1
  1258. transformers/models/smolvlm/image_processing_smolvlm.py +42 -43
  1259. transformers/models/smolvlm/image_processing_smolvlm_fast.py +12 -12
  1260. transformers/models/smolvlm/modeling_smolvlm.py +51 -52
  1261. transformers/models/smolvlm/modular_smolvlm.py +15 -17
  1262. transformers/models/smolvlm/processing_smolvlm.py +15 -76
  1263. transformers/models/smolvlm/video_processing_smolvlm.py +7 -8
  1264. transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +0 -1
  1265. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +20 -23
  1266. transformers/models/speech_to_text/configuration_speech_to_text.py +0 -1
  1267. transformers/models/speech_to_text/feature_extraction_speech_to_text.py +10 -13
  1268. transformers/models/speech_to_text/modeling_speech_to_text.py +52 -54
  1269. transformers/models/speech_to_text/processing_speech_to_text.py +4 -30
  1270. transformers/models/speech_to_text/tokenization_speech_to_text.py +5 -6
  1271. transformers/models/speecht5/configuration_speecht5.py +0 -1
  1272. transformers/models/speecht5/feature_extraction_speecht5.py +16 -37
  1273. transformers/models/speecht5/modeling_speecht5.py +172 -174
  1274. transformers/models/speecht5/number_normalizer.py +0 -1
  1275. transformers/models/speecht5/processing_speecht5.py +3 -37
  1276. transformers/models/speecht5/tokenization_speecht5.py +4 -5
  1277. transformers/models/splinter/configuration_splinter.py +0 -1
  1278. transformers/models/splinter/modeling_splinter.py +54 -56
  1279. transformers/models/splinter/tokenization_splinter.py +2 -4
  1280. transformers/models/squeezebert/configuration_squeezebert.py +0 -1
  1281. transformers/models/squeezebert/modeling_squeezebert.py +60 -62
  1282. transformers/models/squeezebert/tokenization_squeezebert.py +0 -1
  1283. transformers/models/stablelm/configuration_stablelm.py +20 -23
  1284. transformers/models/stablelm/modeling_stablelm.py +39 -42
  1285. transformers/models/starcoder2/configuration_starcoder2.py +19 -22
  1286. transformers/models/starcoder2/modeling_starcoder2.py +33 -36
  1287. transformers/models/starcoder2/modular_starcoder2.py +13 -15
  1288. transformers/models/superglue/configuration_superglue.py +3 -3
  1289. transformers/models/superglue/image_processing_superglue.py +15 -15
  1290. transformers/models/superglue/image_processing_superglue_fast.py +4 -5
  1291. transformers/models/superglue/modeling_superglue.py +32 -33
  1292. transformers/models/superpoint/image_processing_superpoint.py +15 -15
  1293. transformers/models/superpoint/image_processing_superpoint_fast.py +4 -5
  1294. transformers/models/superpoint/modeling_superpoint.py +13 -14
  1295. transformers/models/swiftformer/configuration_swiftformer.py +0 -1
  1296. transformers/models/swiftformer/modeling_swiftformer.py +12 -14
  1297. transformers/models/swin/configuration_swin.py +0 -1
  1298. transformers/models/swin/modeling_swin.py +58 -70
  1299. transformers/models/swin2sr/configuration_swin2sr.py +0 -1
  1300. transformers/models/swin2sr/image_processing_swin2sr.py +10 -13
  1301. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -5
  1302. transformers/models/swin2sr/modeling_swin2sr.py +26 -28
  1303. transformers/models/swinv2/configuration_swinv2.py +0 -1
  1304. transformers/models/swinv2/modeling_swinv2.py +55 -67
  1305. transformers/models/switch_transformers/configuration_switch_transformers.py +0 -1
  1306. transformers/models/switch_transformers/modeling_switch_transformers.py +32 -33
  1307. transformers/models/switch_transformers/modular_switch_transformers.py +29 -30
  1308. transformers/models/t5/configuration_t5.py +0 -1
  1309. transformers/models/t5/modeling_t5.py +75 -77
  1310. transformers/models/t5/tokenization_t5.py +1 -3
  1311. transformers/models/t5gemma/configuration_t5gemma.py +33 -34
  1312. transformers/models/t5gemma/modeling_t5gemma.py +96 -99
  1313. transformers/models/t5gemma/modular_t5gemma.py +117 -118
  1314. transformers/models/t5gemma2/configuration_t5gemma2.py +53 -54
  1315. transformers/models/t5gemma2/modeling_t5gemma2.py +96 -99
  1316. transformers/models/t5gemma2/modular_t5gemma2.py +134 -135
  1317. transformers/models/table_transformer/configuration_table_transformer.py +0 -1
  1318. transformers/models/table_transformer/modeling_table_transformer.py +46 -48
  1319. transformers/models/tapas/configuration_tapas.py +0 -1
  1320. transformers/models/tapas/modeling_tapas.py +64 -66
  1321. transformers/models/tapas/tokenization_tapas.py +115 -153
  1322. transformers/models/textnet/configuration_textnet.py +0 -1
  1323. transformers/models/textnet/image_processing_textnet.py +22 -25
  1324. transformers/models/textnet/image_processing_textnet_fast.py +5 -6
  1325. transformers/models/textnet/modeling_textnet.py +13 -14
  1326. transformers/models/time_series_transformer/configuration_time_series_transformer.py +5 -8
  1327. transformers/models/time_series_transformer/modeling_time_series_transformer.py +79 -81
  1328. transformers/models/timesfm/configuration_timesfm.py +0 -1
  1329. transformers/models/timesfm/modeling_timesfm.py +17 -19
  1330. transformers/models/timesfm/modular_timesfm.py +16 -18
  1331. transformers/models/timesformer/configuration_timesformer.py +0 -1
  1332. transformers/models/timesformer/modeling_timesformer.py +13 -16
  1333. transformers/models/timm_backbone/configuration_timm_backbone.py +0 -1
  1334. transformers/models/timm_backbone/modeling_timm_backbone.py +4 -6
  1335. transformers/models/timm_wrapper/configuration_timm_wrapper.py +2 -3
  1336. transformers/models/timm_wrapper/image_processing_timm_wrapper.py +4 -5
  1337. transformers/models/timm_wrapper/modeling_timm_wrapper.py +13 -15
  1338. transformers/models/trocr/configuration_trocr.py +0 -1
  1339. transformers/models/trocr/modeling_trocr.py +38 -40
  1340. transformers/models/trocr/processing_trocr.py +5 -25
  1341. transformers/models/tvp/configuration_tvp.py +0 -1
  1342. transformers/models/tvp/image_processing_tvp.py +50 -52
  1343. transformers/models/tvp/image_processing_tvp_fast.py +9 -10
  1344. transformers/models/tvp/modeling_tvp.py +25 -27
  1345. transformers/models/tvp/processing_tvp.py +2 -14
  1346. transformers/models/udop/configuration_udop.py +0 -1
  1347. transformers/models/udop/modeling_udop.py +63 -66
  1348. transformers/models/udop/processing_udop.py +7 -26
  1349. transformers/models/udop/tokenization_udop.py +80 -93
  1350. transformers/models/umt5/configuration_umt5.py +0 -1
  1351. transformers/models/umt5/modeling_umt5.py +80 -81
  1352. transformers/models/unispeech/configuration_unispeech.py +0 -1
  1353. transformers/models/unispeech/modeling_unispeech.py +47 -49
  1354. transformers/models/unispeech/modular_unispeech.py +20 -22
  1355. transformers/models/unispeech_sat/configuration_unispeech_sat.py +0 -1
  1356. transformers/models/unispeech_sat/modeling_unispeech_sat.py +63 -65
  1357. transformers/models/unispeech_sat/modular_unispeech_sat.py +21 -23
  1358. transformers/models/univnet/feature_extraction_univnet.py +14 -14
  1359. transformers/models/univnet/modeling_univnet.py +7 -8
  1360. transformers/models/upernet/configuration_upernet.py +0 -1
  1361. transformers/models/upernet/modeling_upernet.py +10 -13
  1362. transformers/models/vaultgemma/__init__.py +0 -1
  1363. transformers/models/vaultgemma/configuration_vaultgemma.py +24 -26
  1364. transformers/models/vaultgemma/modeling_vaultgemma.py +34 -36
  1365. transformers/models/vaultgemma/modular_vaultgemma.py +29 -31
  1366. transformers/models/video_llama_3/image_processing_video_llama_3.py +40 -40
  1367. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +8 -8
  1368. transformers/models/video_llama_3/modeling_video_llama_3.py +66 -66
  1369. transformers/models/video_llama_3/modular_video_llama_3.py +101 -112
  1370. transformers/models/video_llama_3/processing_video_llama_3.py +5 -39
  1371. transformers/models/video_llama_3/video_processing_video_llama_3.py +18 -18
  1372. transformers/models/video_llava/configuration_video_llava.py +0 -1
  1373. transformers/models/video_llava/image_processing_video_llava.py +35 -38
  1374. transformers/models/video_llava/modeling_video_llava.py +52 -54
  1375. transformers/models/video_llava/processing_video_llava.py +38 -78
  1376. transformers/models/video_llava/video_processing_video_llava.py +0 -1
  1377. transformers/models/videomae/configuration_videomae.py +0 -1
  1378. transformers/models/videomae/image_processing_videomae.py +31 -34
  1379. transformers/models/videomae/modeling_videomae.py +13 -15
  1380. transformers/models/videomae/video_processing_videomae.py +0 -1
  1381. transformers/models/vilt/configuration_vilt.py +0 -1
  1382. transformers/models/vilt/image_processing_vilt.py +29 -30
  1383. transformers/models/vilt/image_processing_vilt_fast.py +9 -10
  1384. transformers/models/vilt/modeling_vilt.py +76 -78
  1385. transformers/models/vilt/processing_vilt.py +2 -14
  1386. transformers/models/vipllava/configuration_vipllava.py +0 -1
  1387. transformers/models/vipllava/modeling_vipllava.py +38 -39
  1388. transformers/models/vipllava/modular_vipllava.py +30 -32
  1389. transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +0 -1
  1390. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +18 -21
  1391. transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +0 -1
  1392. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +18 -21
  1393. transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +2 -16
  1394. transformers/models/visual_bert/configuration_visual_bert.py +0 -1
  1395. transformers/models/visual_bert/modeling_visual_bert.py +90 -92
  1396. transformers/models/vit/configuration_vit.py +0 -1
  1397. transformers/models/vit/image_processing_vit.py +19 -22
  1398. transformers/models/vit/image_processing_vit_fast.py +0 -1
  1399. transformers/models/vit/modeling_vit.py +13 -15
  1400. transformers/models/vit_mae/configuration_vit_mae.py +0 -1
  1401. transformers/models/vit_mae/modeling_vit_mae.py +21 -23
  1402. transformers/models/vit_msn/configuration_vit_msn.py +0 -1
  1403. transformers/models/vit_msn/modeling_vit_msn.py +10 -12
  1404. transformers/models/vitdet/configuration_vitdet.py +0 -1
  1405. transformers/models/vitdet/modeling_vitdet.py +12 -14
  1406. transformers/models/vitmatte/configuration_vitmatte.py +1 -4
  1407. transformers/models/vitmatte/image_processing_vitmatte.py +15 -18
  1408. transformers/models/vitmatte/image_processing_vitmatte_fast.py +14 -15
  1409. transformers/models/vitmatte/modeling_vitmatte.py +9 -11
  1410. transformers/models/vitpose/configuration_vitpose.py +3 -6
  1411. transformers/models/vitpose/image_processing_vitpose.py +24 -25
  1412. transformers/models/vitpose/image_processing_vitpose_fast.py +9 -10
  1413. transformers/models/vitpose/modeling_vitpose.py +10 -12
  1414. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +0 -1
  1415. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +8 -10
  1416. transformers/models/vits/configuration_vits.py +0 -1
  1417. transformers/models/vits/modeling_vits.py +34 -35
  1418. transformers/models/vits/tokenization_vits.py +3 -4
  1419. transformers/models/vivit/configuration_vivit.py +0 -1
  1420. transformers/models/vivit/image_processing_vivit.py +36 -39
  1421. transformers/models/vivit/modeling_vivit.py +5 -7
  1422. transformers/models/vjepa2/__init__.py +0 -1
  1423. transformers/models/vjepa2/configuration_vjepa2.py +0 -1
  1424. transformers/models/vjepa2/modeling_vjepa2.py +30 -32
  1425. transformers/models/vjepa2/video_processing_vjepa2.py +0 -1
  1426. transformers/models/voxtral/__init__.py +0 -1
  1427. transformers/models/voxtral/configuration_voxtral.py +0 -1
  1428. transformers/models/voxtral/modeling_voxtral.py +17 -25
  1429. transformers/models/voxtral/modular_voxtral.py +10 -19
  1430. transformers/models/voxtral/processing_voxtral.py +25 -48
  1431. transformers/models/wav2vec2/configuration_wav2vec2.py +0 -1
  1432. transformers/models/wav2vec2/feature_extraction_wav2vec2.py +7 -10
  1433. transformers/models/wav2vec2/modeling_wav2vec2.py +67 -122
  1434. transformers/models/wav2vec2/processing_wav2vec2.py +6 -35
  1435. transformers/models/wav2vec2/tokenization_wav2vec2.py +20 -332
  1436. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +0 -1
  1437. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +49 -52
  1438. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +45 -48
  1439. transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +6 -35
  1440. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +0 -1
  1441. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +62 -65
  1442. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +15 -18
  1443. transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +16 -17
  1444. transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +36 -55
  1445. transformers/models/wavlm/configuration_wavlm.py +0 -1
  1446. transformers/models/wavlm/modeling_wavlm.py +45 -48
  1447. transformers/models/wavlm/modular_wavlm.py +4 -5
  1448. transformers/models/whisper/configuration_whisper.py +0 -1
  1449. transformers/models/whisper/english_normalizer.py +3 -4
  1450. transformers/models/whisper/feature_extraction_whisper.py +9 -24
  1451. transformers/models/whisper/generation_whisper.py +26 -48
  1452. transformers/models/whisper/modeling_whisper.py +68 -70
  1453. transformers/models/whisper/processing_whisper.py +3 -20
  1454. transformers/models/whisper/tokenization_whisper.py +9 -30
  1455. transformers/models/x_clip/configuration_x_clip.py +0 -1
  1456. transformers/models/x_clip/modeling_x_clip.py +68 -69
  1457. transformers/models/x_clip/processing_x_clip.py +2 -14
  1458. transformers/models/xcodec/configuration_xcodec.py +4 -6
  1459. transformers/models/xcodec/modeling_xcodec.py +15 -17
  1460. transformers/models/xglm/configuration_xglm.py +0 -1
  1461. transformers/models/xglm/modeling_xglm.py +49 -55
  1462. transformers/models/xglm/tokenization_xglm.py +1 -4
  1463. transformers/models/xlm/configuration_xlm.py +0 -1
  1464. transformers/models/xlm/modeling_xlm.py +126 -130
  1465. transformers/models/xlm/tokenization_xlm.py +3 -5
  1466. transformers/models/xlm_roberta/configuration_xlm_roberta.py +0 -1
  1467. transformers/models/xlm_roberta/modeling_xlm_roberta.py +90 -92
  1468. transformers/models/xlm_roberta/modular_xlm_roberta.py +50 -53
  1469. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +1 -4
  1470. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +0 -1
  1471. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +91 -93
  1472. transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +67 -70
  1473. transformers/models/xlnet/configuration_xlnet.py +0 -11
  1474. transformers/models/xlnet/modeling_xlnet.py +149 -162
  1475. transformers/models/xlnet/tokenization_xlnet.py +1 -4
  1476. transformers/models/xlstm/configuration_xlstm.py +3 -5
  1477. transformers/models/xlstm/modeling_xlstm.py +62 -65
  1478. transformers/models/xmod/configuration_xmod.py +0 -1
  1479. transformers/models/xmod/modeling_xmod.py +98 -100
  1480. transformers/models/yolos/configuration_yolos.py +0 -1
  1481. transformers/models/yolos/image_processing_yolos.py +60 -62
  1482. transformers/models/yolos/image_processing_yolos_fast.py +18 -18
  1483. transformers/models/yolos/modeling_yolos.py +12 -14
  1484. transformers/models/yolos/modular_yolos.py +2 -4
  1485. transformers/models/yoso/configuration_yoso.py +0 -1
  1486. transformers/models/yoso/modeling_yoso.py +60 -62
  1487. transformers/models/zamba/configuration_zamba.py +0 -1
  1488. transformers/models/zamba/modeling_zamba.py +68 -69
  1489. transformers/models/zamba2/configuration_zamba2.py +36 -37
  1490. transformers/models/zamba2/modeling_zamba2.py +84 -87
  1491. transformers/models/zamba2/modular_zamba2.py +43 -45
  1492. transformers/models/zoedepth/configuration_zoedepth.py +0 -1
  1493. transformers/models/zoedepth/image_processing_zoedepth.py +28 -29
  1494. transformers/models/zoedepth/image_processing_zoedepth_fast.py +11 -12
  1495. transformers/models/zoedepth/modeling_zoedepth.py +14 -16
  1496. transformers/pipelines/__init__.py +50 -49
  1497. transformers/pipelines/any_to_any.py +14 -22
  1498. transformers/pipelines/audio_utils.py +1 -2
  1499. transformers/pipelines/base.py +12 -16
  1500. transformers/pipelines/deprecated/__init__.py +0 -1
  1501. transformers/pipelines/image_text_to_text.py +0 -1
  1502. transformers/pipelines/image_to_text.py +4 -44
  1503. transformers/pipelines/question_answering.py +4 -43
  1504. transformers/pipelines/text_classification.py +1 -14
  1505. transformers/pipelines/token_classification.py +1 -22
  1506. transformers/pipelines/video_classification.py +1 -9
  1507. transformers/pipelines/zero_shot_audio_classification.py +0 -1
  1508. transformers/pipelines/zero_shot_classification.py +0 -6
  1509. transformers/pipelines/zero_shot_image_classification.py +0 -7
  1510. transformers/processing_utils.py +95 -95
  1511. transformers/quantizers/base.py +10 -0
  1512. transformers/quantizers/quantizer_quark.py +0 -1
  1513. transformers/quantizers/quantizer_torchao.py +3 -3
  1514. transformers/testing_utils.py +3 -37
  1515. transformers/tokenization_mistral_common.py +554 -903
  1516. transformers/tokenization_utils_base.py +109 -122
  1517. transformers/tokenization_utils_sentencepiece.py +5 -6
  1518. transformers/tokenization_utils_tokenizers.py +5 -5
  1519. transformers/trainer.py +6 -9
  1520. transformers/trainer_jit_checkpoint.py +1 -2
  1521. transformers/training_args.py +3 -3
  1522. transformers/utils/attention_visualizer.py +1 -1
  1523. transformers/utils/auto_docstring.py +564 -12
  1524. transformers/utils/doc.py +1 -1
  1525. transformers/utils/dummy_pt_objects.py +0 -42
  1526. transformers/utils/generic.py +1 -1
  1527. transformers/utils/loading_report.py +3 -3
  1528. transformers/utils/quantization_config.py +8 -10
  1529. transformers/video_processing_utils.py +19 -20
  1530. transformers/video_utils.py +18 -22
  1531. {transformers-5.0.0rc2.dist-info → transformers-5.0.0rc3.dist-info}/METADATA +19 -19
  1532. transformers-5.0.0rc3.dist-info/RECORD +2067 -0
  1533. transformers-5.0.0rc2.dist-info/RECORD +0 -2042
  1534. {transformers-5.0.0rc2.dist-info → transformers-5.0.0rc3.dist-info}/WHEEL +0 -0
  1535. {transformers-5.0.0rc2.dist-info → transformers-5.0.0rc3.dist-info}/entry_points.txt +0 -0
  1536. {transformers-5.0.0rc2.dist-info → transformers-5.0.0rc3.dist-info}/licenses/LICENSE +0 -0
  1537. {transformers-5.0.0rc2.dist-info → transformers-5.0.0rc3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1611 @@
+ # Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import math
+ from collections.abc import Callable
+ from dataclasses import dataclass
+ from typing import Any
+
+ import torch
+ from torch import nn
+
+ from ... import initialization as init
+ from ...activations import ACT2FN
+ from ...configuration_utils import PreTrainedConfig
+ from ...modeling_layers import GradientCheckpointingLayer
+ from ...modeling_outputs import BackboneOutput
+ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+ from ...processing_utils import Unpack
+ from ...pytorch_utils import meshgrid
+ from ...utils import ModelOutput, TransformersKwargs, auto_docstring, logging
+ from ...utils.generic import check_model_inputs
+ from ..auto.configuration_auto import AutoConfig
+ from ..convnext.modeling_convnext import ConvNextLayerNorm
+ from ..dab_detr.modeling_dab_detr import gen_sine_position_embeddings
+ from ..deformable_detr.modeling_deformable_detr import (
+     DeformableDetrDecoderOutput,
+     DeformableDetrForObjectDetection,
+     DeformableDetrMLPPredictionHead,
+     DeformableDetrModel,
+     DeformableDetrMultiscaleDeformableAttention,
+ )
+ from ..llama.modeling_llama import eager_attention_forward
+ from ..rt_detr.configuration_rt_detr import CONFIG_MAPPING
+ from ..rt_detr.modeling_rt_detr import RTDetrConvNormLayer
+ from ..vit.modeling_vit import ViTAttention, ViTEncoder, ViTSelfAttention
+ from ..vitdet.configuration_vitdet import VitDetConfig
+ from ..vitdet.modeling_vitdet import (
+     VitDetBackbone,
+     VitDetEmbeddings,
+     VitDetMlp,
+     VitDetPreTrainedModel,
+ )
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class LwDetrViTConfig(VitDetConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`LwDetrViTModel`]. It is used to instantiate an
+     LW-DETR ViT model according to the specified arguments, defining the model architecture. Instantiating a
+     configuration with the defaults will yield a similar configuration to that of the LW-DETR ViT
+     [stevenbucaille/lwdetr_small_60e_coco](https://huggingface.co/stevenbucaille/lwdetr_small_60e_coco) architecture.
+
+     LW-DETR ViT is the Vision Transformer backbone used in the LW-DETR model for real-time object detection. It
+     features interleaved window and global attention mechanisms to reduce computational complexity while maintaining
+     high performance. The model uses a window-major feature map organization for efficient attention computation.
+
+     Configuration objects inherit from [`VitDetConfig`] and can be used to control the model outputs. Read the
+     documentation from [`VitDetConfig`] for more information.
+
+     Args:
+         hidden_size (`int`, *optional*, defaults to 768):
+             Dimensionality of the encoder layers and the pooler layer.
+         num_hidden_layers (`int`, *optional*, defaults to 12):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 12):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         mlp_ratio (`int`, *optional*, defaults to 4):
+             Ratio of mlp hidden dim to embedding dim.
+         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+             `"relu"`, `"selu"` and `"gelu_new"` are supported.
+         dropout_prob (`float`, *optional*, defaults to 0.0):
+             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the layer normalization layers.
+         image_size (`int`, *optional*, defaults to 256):
+             The size (resolution) of each image.
+         pretrain_image_size (`int`, *optional*, defaults to 224):
+             The size (resolution) of each image during pretraining.
+         patch_size (`int`, *optional*, defaults to 16):
+             The size (resolution) of each patch.
+         num_channels (`int`, *optional*, defaults to 3):
+             The number of input channels.
+         qkv_bias (`bool`, *optional*, defaults to `True`):
+             Whether to add a bias to the queries, keys and values.
+         window_block_indices (`list[int]`, *optional*, defaults to `[]`):
+             List of indices of blocks that should have window attention instead of regular global self-attention.
+         use_absolute_position_embeddings (`bool`, *optional*, defaults to `True`):
+             Whether to add absolute position embeddings to the patch embeddings.
+         out_features (`list[str]`, *optional*):
+             If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+             (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+             corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+             same order as defined in the `stage_names` attribute.
+         out_indices (`list[int]`, *optional*):
+             If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+             many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+             If unset and `out_features` is unset, will default to the last stage. Must be in the
+             same order as defined in the `stage_names` attribute.
+         cae_init_values (`float`, *optional*, defaults to 0.1):
+             Initialization value for CAE parameters when `use_cae` is enabled.
+         num_windows (`int`, *optional*, defaults to 16):
+             Number of windows for window-based attention. Must be a perfect square and the image size must be
+             divisible by the square root of this value. This enables efficient window-major feature map organization.
+
+     Example:
+
+     ```python
+     >>> from transformers import LwDetrViTConfig, LwDetrViTModel
+
+     >>> # Initializing a LW-DETR ViT configuration
+     >>> configuration = LwDetrViTConfig()
+
+     >>> # Initializing a model (with random weights) from the configuration
+     >>> model = LwDetrViTModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "lw_detr_vit"
+
+     def __init__(
+         self,
+         hidden_size=768,
+         num_hidden_layers=12,
+         num_attention_heads=12,
+         mlp_ratio=4,
+         hidden_act="gelu",
+         dropout_prob=0.0,
+         initializer_range=0.02,
+         layer_norm_eps=1e-6,
+         image_size=256,
+         pretrain_image_size=224,
+         patch_size=16,
+         num_channels=3,
+         qkv_bias=True,
+         window_block_indices=[],
+         use_absolute_position_embeddings=True,
+         out_features=None,
+         out_indices=None,
+         cae_init_values: float = 0.1,
+         num_windows=16,
+         **kwargs,
+     ):
+         super().__init__(
+             hidden_size=hidden_size,
+             num_hidden_layers=num_hidden_layers,
+             num_attention_heads=num_attention_heads,
+             mlp_ratio=mlp_ratio,
+             hidden_act=hidden_act,
+             dropout_prob=dropout_prob,
+             initializer_range=initializer_range,
+             layer_norm_eps=layer_norm_eps,
+             image_size=image_size,
+             pretrain_image_size=pretrain_image_size,
+             patch_size=patch_size,
+             num_channels=num_channels,
+             qkv_bias=qkv_bias,
+             window_block_indices=window_block_indices,
+             use_absolute_position_embeddings=use_absolute_position_embeddings,
+             out_features=out_features,
+             out_indices=out_indices,
+             **kwargs,
+         )
+         del self.residual_block_indices
+         del self.use_relative_position_embeddings
+         del self.window_size
+         del self.drop_path_rate
+
+         self.cae_init_values = cae_init_values
+         if num_windows % math.sqrt(num_windows) != 0:
+             raise ValueError(
+                 f"`num_windows` has to be a perfect square (num_windows % math.sqrt(num_windows) must be 0), but got {num_windows}."
+             )
+         if image_size / num_windows % math.sqrt(num_windows) != 0:
+             raise ValueError(
+                 f"`image_size` divided by `num_windows` has to be divisible by `math.sqrt(num_windows)` (image_size / num_windows % math.sqrt(num_windows) must be 0), but got {image_size} and {num_windows}."
+             )
+         self.num_windows = num_windows
+         self.num_windows_side = int(math.sqrt(num_windows))
+
+
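As a quick illustration of the window constraints enforced in `LwDetrViTConfig.__init__` above (a minimal standalone sketch, not part of the diff, using only the standard-library `math` module): the default `num_windows=16` with `image_size=256` yields a 4x4 grid of windows, while a non-square value such as 8 fails the perfect-square check.

```python
import math

# Defaults from LwDetrViTConfig: a 4x4 grid of windows over a 256px image.
num_windows, image_size = 16, 256

assert num_windows % math.sqrt(num_windows) == 0               # 16 % 4 == 0: perfect square
assert image_size / num_windows % math.sqrt(num_windows) == 0  # 256 / 16 = 16.0; 16.0 % 4 == 0
num_windows_side = int(math.sqrt(num_windows))                 # 4 windows per side

# A non-square value is rejected: 8 % math.sqrt(8) is ~2.34, so __init__ raises ValueError.
print(num_windows_side)  # 4
```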
+ class LwDetrConfig(PreTrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`LwDetrModel`]. It is used to instantiate
+     a LW-DETR model according to the specified arguments, defining the model architecture. Instantiating a
+     configuration with the defaults will yield a similar configuration to that of the LW-DETR
+     [stevenbucaille/lwdetr_small_60e_coco](https://huggingface.co/stevenbucaille/lwdetr_small_60e_coco) architecture.
+
+     LW-DETR (Lightweight Detection Transformer) is a transformer-based object detection model designed for real-time
+     detection tasks. It replaces traditional CNN-based detectors like YOLO with a more efficient transformer
+     architecture that achieves competitive performance while being computationally lightweight.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         backbone_config (`PretrainedConfig` or `dict`, *optional*):
+             The configuration of the backbone model. If not provided, will default to `LwDetrViTConfig` with
+             a small ViT architecture optimized for detection tasks.
+         projector_scale_factors (`list[float]`, *optional*, defaults to `[]`):
+             Scale factors for the feature pyramid network. Each scale factor determines the resolution of features
+             at different levels. Supported values are 0.5, 1.0, and 2.0.
+         hidden_expansion (`float`, *optional*, defaults to 0.5):
+             Expansion factor for hidden dimensions in the projector layers.
+         c2f_num_blocks (`int`, *optional*, defaults to 3):
+             Number of blocks in the C2F layer.
+         activation_function (`str`, *optional*, defaults to `"silu"`):
+             The non-linear activation function in the projector. Supported values are `"silu"`, `"relu"`, `"gelu"`.
+         batch_norm_eps (`float`, *optional*, defaults to 1e-05):
+             The epsilon value for batch normalization layers.
+         d_model (`int`, *optional*, defaults to 256):
+             Dimension of the model layers and the number of expected features in the decoder inputs.
+         dropout (`float`, *optional*, defaults to 0.1):
+             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+         decoder_ffn_dim (`int`, *optional*, defaults to 2048):
+             Dimension of the "intermediate" (often named feed-forward) layer in the decoder.
+         decoder_n_points (`int`, *optional*, defaults to 4):
+             The number of sampled keys in each feature level for each attention head in the decoder.
+         decoder_layers (`int`, *optional*, defaults to 3):
+             Number of decoder layers in the transformer.
+         decoder_self_attention_heads (`int`, *optional*, defaults to 8):
+             Number of attention heads for each attention layer in the decoder self-attention.
+         decoder_cross_attention_heads (`int`, *optional*, defaults to 16):
+             Number of attention heads for each attention layer in the decoder cross-attention.
+         decoder_activation_function (`str`, *optional*, defaults to `"relu"`):
+             The non-linear activation function in the decoder. Supported values are `"relu"`, `"silu"`, `"gelu"`.
+         num_queries (`int`, *optional*, defaults to 300):
+             Number of object queries, i.e. detection slots. This is the maximal number of objects
+             [`LwDetrModel`] can detect in a single image.
+         attention_bias (`bool`, *optional*, defaults to `True`):
+             Whether to add bias to the attention layers.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         activation_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for activations inside the fully connected layer.
+         group_detr (`int`, *optional*, defaults to 13):
+             Number of groups for the Group DETR attention mechanism, which helps reduce computational complexity.
+         init_std (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         disable_custom_kernels (`bool`, *optional*, defaults to `True`):
+             Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom
258
+ kernels are not supported by PyTorch ONNX export.
259
+ class_cost (`float`, *optional*, defaults to 2):
260
+ Relative weight of the classification error in the Hungarian matching cost.
261
+ bbox_cost (`float`, *optional*, defaults to 5):
262
+ Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
263
+ giou_cost (`float`, *optional*, defaults to 2):
264
+ Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
265
+ mask_loss_coefficient (`float`, *optional*, defaults to 1):
266
+ Relative weight of the Focal loss in the panoptic segmentation loss.
267
+ dice_loss_coefficient (`float`, *optional*, defaults to 1):
268
+ Relative weight of the DICE/F-1 loss in the panoptic segmentation loss.
269
+ bbox_loss_coefficient (`float`, *optional*, defaults to 5):
270
+ Relative weight of the L1 bounding box loss in the object detection loss.
271
+ giou_loss_coefficient (`float`, *optional*, defaults to 2):
272
+ Relative weight of the generalized IoU loss in the object detection loss.
273
+ eos_coefficient (`float`, *optional*, defaults to 0.1):
274
+ Relative classification weight of the 'no-object' class in the object detection loss.
275
+ focal_alpha (`float`, *optional*, defaults to 0.25):
276
+ Alpha parameter in the focal loss.
277
+ auxiliary_loss (`bool`, *optional*, defaults to `True`):
278
+ Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
279
+
280
+ Examples:
281
+
282
+ ```python
283
+ >>> from transformers import LwDetrConfig, LwDetrModel
284
+
285
+ >>> # Initializing a LW-DETR stevenbucaille/lwdetr_small_60e_coco style configuration
286
+ >>> configuration = LwDetrConfig()
287
+
288
+ >>> # Initializing a model (with random weights) from the stevenbucaille/lwdetr_small_60e_coco style configuration
289
+ >>> model = LwDetrModel(configuration)
290
+
291
+ >>> # Accessing the model configuration
292
+ >>> configuration = model.config
293
+ ```"""
294
+
295
+ model_type = "lw_detr"
296
+ sub_configs = {"backbone_config": AutoConfig}
297
+
298
+ def __init__(
299
+ self,
300
+ # backbone
301
+ backbone_config=None,
302
+ # projector
303
+ projector_scale_factors: list[float] = [],
304
+ hidden_expansion=0.5,
305
+ c2f_num_blocks=3,
306
+ activation_function="silu",
307
+ batch_norm_eps=1e-5,
308
+ # decoder
309
+ d_model=256,
310
+ dropout=0.1,
311
+ decoder_ffn_dim=2048,
312
+ decoder_n_points=4,
313
+ decoder_layers: int = 3,
314
+ decoder_self_attention_heads: int = 8,
315
+ decoder_cross_attention_heads: int = 16,
316
+ decoder_activation_function="relu",
317
+ # model
318
+ num_queries=300,
319
+ attention_bias=True,
320
+ attention_dropout=0.0,
321
+ activation_dropout=0.0,
322
+ group_detr: int = 13,
323
+ init_std=0.02,
324
+ disable_custom_kernels=True,
325
+ # loss
326
+ class_cost=2,
327
+ bbox_cost=5,
328
+ giou_cost=2,
329
+ mask_loss_coefficient=1,
330
+ dice_loss_coefficient=1,
331
+ bbox_loss_coefficient=5,
332
+ giou_loss_coefficient=2,
333
+ eos_coefficient=0.1,
334
+ focal_alpha=0.25,
335
+ auxiliary_loss=True,
336
+ **kwargs,
337
+ ):
338
+ self.batch_norm_eps = batch_norm_eps
339
+
340
+ # backbone
341
+ if backbone_config is None:
342
+ logger.info(
343
+ "`backbone_config` and `backbone` are `None`. Initializing the config with the default `LwDetrViT` backbone."
344
+ )
345
+ backbone_config = LwDetrViTConfig(
346
+ image_size=1024,
347
+ hidden_size=192,
348
+ num_hidden_layers=10,
349
+ num_attention_heads=12,
350
+ window_block_indices=[0, 1, 3, 6, 7, 9],
351
+ out_indices=[2, 4, 5, 9],
352
+ **kwargs,
353
+ )
354
+ elif isinstance(backbone_config, dict):
355
+ backbone_model_type = backbone_config.pop("model_type")
356
+ config_class = CONFIG_MAPPING[backbone_model_type]
357
+ backbone_config = config_class.from_dict(backbone_config)
358
+
359
+ self.backbone_config = backbone_config
360
+ # projector
361
+ self.projector_scale_factors = projector_scale_factors
362
+ for scale in projector_scale_factors:
363
+ if scale not in [0.5, 1.0, 2.0]:
364
+ raise ValueError(f"Unsupported scale factor: {scale}")
365
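+ # NOTE: e.g. projector_scale_factors=[2.0, 1.0, 0.5] (illustrative, not a default) builds a
+ # three-level pyramid at 2x, 1x and 0.5x the backbone output resolution; the default [] creates
+ # no feature levels, so checkpoints are expected to set this explicitly.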
+ self.projector_in_channels = [d_model] * len(projector_scale_factors)
+ self.projector_out_channels = d_model
+ self.activation_function = activation_function
+ self.hidden_expansion = hidden_expansion
+ self.c2f_num_blocks = c2f_num_blocks
+ # decoder
+ self.d_model = d_model
+ self.dropout = dropout
+ self.num_queries = num_queries
+ self.decoder_ffn_dim = decoder_ffn_dim
+ self.num_feature_levels = len(self.projector_scale_factors)
+ self.decoder_n_points = decoder_n_points
+ self.decoder_layers = decoder_layers
+ self.decoder_activation_function = decoder_activation_function
+ self.decoder_self_attention_heads = decoder_self_attention_heads
+ self.decoder_cross_attention_heads = decoder_cross_attention_heads
+ self.attention_bias = attention_bias
+ self.attention_dropout = attention_dropout
+ self.activation_dropout = activation_dropout
+ # model
+ self.init_std = init_std
+ self.group_detr = group_detr
+ # Loss
+ self.auxiliary_loss = auxiliary_loss
+ # Hungarian matcher
+ self.class_cost = class_cost
+ self.bbox_cost = bbox_cost
+ self.giou_cost = giou_cost
+ # Loss coefficients
+ self.mask_loss_coefficient = mask_loss_coefficient
+ self.dice_loss_coefficient = dice_loss_coefficient
+ self.bbox_loss_coefficient = bbox_loss_coefficient
+ self.giou_loss_coefficient = giou_loss_coefficient
+ self.eos_coefficient = eos_coefficient
+ self.focal_alpha = focal_alpha
+ self.disable_custom_kernels = disable_custom_kernels
+ super().__init__(**kwargs)
+
+
+ class LwDetrViTSelfAttention(ViTSelfAttention):
+ def __init__(self, config: LwDetrViTConfig):
+ super().__init__(config)
+ del self.key
+ self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
+ self.num_key_value_groups = 1
+ self.dropout_prob = config.dropout_prob
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ **kwargs: Unpack[TransformersKwargs],
+ ) -> tuple[torch.Tensor, torch.Tensor]:
+ batch_size = hidden_states.shape[0]
+ new_shape = batch_size, -1, self.num_attention_heads, self.attention_head_size
+
+ key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2)
+ value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
+ query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)
+
+ attention_interface: Callable = eager_attention_forward
+ if self.config._attn_implementation != "eager":
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
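+ # NOTE: ALL_ATTENTION_FUNCTIONS is the transformers registry keyed by config._attn_implementation
+ # (e.g. "sdpa", "flash_attention_2", "flex_attention"), so the eager-shaped q/k/v computed above
+ # are dispatched unchanged to whichever backend the config requested.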
+
+ context_layer, attention_probs = attention_interface(
+ self,
+ query_layer,
+ key_layer,
+ value_layer,
+ None,
+ is_causal=self.is_causal,
+ scaling=self.scaling,
+ dropout=0.0 if not self.training else self.dropout_prob,
+ **kwargs,
+ )
+
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.reshape(new_context_layer_shape)
+
+ return context_layer, attention_probs
+
+
+ class LwDetrViTAttention(ViTAttention):
+ def __init__(self, config: LwDetrViTConfig):
+ """
+ Args:
+ config (`LwDetrViTConfig`):
+ Model configuration.
+ """
+ super().__init__(config)
+ self.attention = LwDetrViTSelfAttention(config)
+ self.output = nn.Linear(config.hidden_size, config.hidden_size)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ **kwargs: Unpack[TransformersKwargs],
+ ) -> torch.Tensor:
+ self_attn_output, _ = self.attention(hidden_states, **kwargs)
+ output = self.output(self_attn_output)
+ return output
+
+
+ class LwDetrViTMlp(VitDetMlp):
+ pass
+
+
+ class LwDetrViTLayer(GradientCheckpointingLayer):
+ def __init__(
+ self,
+ config: LwDetrViTConfig,
+ layer_idx,
+ ) -> None:
+ super().__init__()
+
+ dim = config.hidden_size
+ self.attention = LwDetrViTAttention(config)
+ self.intermediate = LwDetrViTMlp(config=config, in_features=dim, hidden_features=int(dim * config.mlp_ratio))
+ self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ self.gamma_1 = nn.Parameter(torch.Tensor(dim), requires_grad=True)
+ self.gamma_2 = nn.Parameter(torch.Tensor(dim), requires_grad=True)
+
+ self.window = layer_idx in config.window_block_indices
+ self.num_windows = config.num_windows
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ **kwargs: Unpack[TransformersKwargs],
+ ) -> torch.Tensor:
+ batch_size, seq_len, channels = hidden_states.shape
+ hidden_states_norm = self.layernorm_before(hidden_states)
+
+ if not self.window:
+ hidden_states_norm = hidden_states_norm.reshape(
+ batch_size // self.num_windows, self.num_windows * seq_len, channels
+ )
+
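+ # NOTE: hidden states arrive window-major from the backbone, i.e. batch_size here is really
+ # (images * num_windows) with seq_len tokens per window. Window layers attend within each window
+ # as-is; global layers (not self.window) first merge the windows back into one sequence of
+ # num_windows * seq_len tokens per image, then split again after attention.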
+ attention_output = self.attention(hidden_states_norm, **kwargs)
+ attention_output = attention_output * self.gamma_1
+
+ if not self.window:
+ attention_output = attention_output.reshape(batch_size, seq_len, channels)
+
+ hidden_states = hidden_states + attention_output
+
+ layer_output = self.layernorm_after(hidden_states)
+ layer_output = self.intermediate(layer_output)
+ layer_output = layer_output * self.gamma_2
+
+ hidden_states = hidden_states + layer_output
+
+ return hidden_states
+
+
+ class LwDetrViTEncoder(ViTEncoder):
+ def __init__(self, config: LwDetrViTConfig) -> None:
+ super().__init__(config)
+ self.layer = nn.ModuleList([LwDetrViTLayer(config, i) for i in range(config.num_hidden_layers)])
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ **kwargs: Unpack[TransformersKwargs],
+ ) -> list[torch.Tensor]:
+ list_hidden_states = [hidden_states]
+ for i, layer_module in enumerate(self.layer):
+ hidden_states = layer_module(hidden_states, **kwargs)
+ list_hidden_states.append(hidden_states)
+ return list_hidden_states
+
+
+ class LwDetrViTEmbeddings(VitDetEmbeddings):
+ pass
+
+
+ class LwDetrViTPreTrainedModel(VitDetPreTrainedModel):
+ config: LwDetrViTConfig
+ base_model_prefix = "lw_detr_vit"
+ main_input_name = "pixel_values"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["LwDetrViTEmbeddings", "LwDetrViTLayer"]
+ _supports_sdpa = True
+ _supports_flash_attn = True
+ _supports_flex_attn = True
+ _supports_attention_backend = True
+ _can_record_outputs = {
+ "hidden_states": LwDetrViTLayer,
+ "attentions": LwDetrViTSelfAttention,
+ }
+
+ def _init_weights(self, module) -> None:
+ """Initialize the weights"""
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
+ init.trunc_normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ init.zeros_(module.bias)
+ elif isinstance(module, nn.LayerNorm):
+ init.zeros_(module.bias)
+ init.ones_(module.weight)
+ elif isinstance(module, LwDetrViTEmbeddings):
+ init.trunc_normal_(module.position_embeddings, mean=0.0, std=self.config.initializer_range)
+ if isinstance(module, LwDetrViTLayer):
+ nn.init.constant_(module.gamma_1, self.config.cae_init_values)
+ nn.init.constant_(module.gamma_2, self.config.cae_init_values)
+
+
+ @auto_docstring()
+ class LwDetrViTBackbone(VitDetBackbone):
+ @check_model_inputs
+ @auto_docstring
+ def forward(self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BackboneOutput:
+ r"""
+ Examples:
+
+ ```python
+ >>> from transformers import LwDetrViTConfig, LwDetrViTBackbone
+ >>> import torch
+
+ >>> config = LwDetrViTConfig()
+ >>> model = LwDetrViTBackbone(config)
+
+ >>> pixel_values = torch.randn(1, 3, 256, 256)
+
+ >>> with torch.no_grad():
+ ...     outputs = model(pixel_values)
+
+ >>> feature_maps = outputs.feature_maps
+ >>> list(feature_maps[-1].shape)
+ [1, 768, 16, 16]
+ ```"""
+ embedding_output = self.embeddings(pixel_values)
+
+ batch_size, channels, height, width = embedding_output.shape
+ # (batch_size, channels, height, width) -> (batch_size, height, width, channels)
+ hidden_states = embedding_output.permute(0, 2, 3, 1)
+
+ window_height = height // self.config.num_windows_side
+ window_width = width // self.config.num_windows_side
+ # (batch_size, height, width, channels) -> (batch_size*num_windows_side**2, window_height*window_width, channels)
+ hidden_states = (
+ hidden_states.reshape(
+ batch_size,
+ self.config.num_windows_side,
+ window_height,
+ self.config.num_windows_side,
+ window_width,
+ channels,
+ )
+ .permute(0, 1, 3, 2, 4, 5)
+ .reshape(batch_size * self.config.num_windows_side**2, window_height * window_width, channels)
+ )
+
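+ # NOTE: shape walkthrough with the default config: a 256x256 image gives a 16x16 patch grid
+ # (height = width = 16), num_windows_side = 4 gives 4x4 windows of 4x4 patches, so the tensor
+ # above is (batch * 16 windows, 16 tokens, channels) when it enters the encoder.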
+ hidden_states = self.encoder(hidden_states, **kwargs)
+
+ feature_maps = ()
+ for stage, hidden_state in zip(self.stage_names, hidden_states):
+ if stage in self.out_features:
+ hidden_state = (
+ hidden_state.reshape(
+ batch_size,
+ self.config.num_windows_side,
+ self.config.num_windows_side,
+ window_height,
+ window_width,
+ channels,
+ )
+ .permute(0, 5, 1, 3, 2, 4)
+ .reshape(batch_size, channels, height, width)
+ )
+ feature_maps += (hidden_state,)
+
+ return BackboneOutput(feature_maps=feature_maps)
+
+
+ class LwDetrConvNormLayer(RTDetrConvNormLayer):
+ def __init__(
+ self,
+ config: LwDetrConfig,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: int,
+ stride: int,
+ activation: str | None = None,
+ ):
+ super().__init__(config, in_channels, out_channels, kernel_size, stride, activation)
+ self.conv = nn.Conv2d(
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride,
+ padding=kernel_size // 2,
+ bias=False,
+ )
+
+
+ class LwDetrRepVggBlock(nn.Module):
+ def __init__(self, config: LwDetrConfig):
+ super().__init__()
+ hidden_channels = int(config.d_model * config.hidden_expansion)
+ self.conv1 = LwDetrConvNormLayer(
+ config, hidden_channels, hidden_channels, 3, 1, activation=config.activation_function
+ )
+ self.conv2 = LwDetrConvNormLayer(
+ config, hidden_channels, hidden_channels, 3, 1, activation=config.activation_function
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ y = self.conv1(x)
+ y = self.conv2(y)
+ return y
+
+
+ class LwDetrC2FLayer(nn.Module):
+ # Inspired by RTDetrCSPRepLayer
+ def __init__(self, config: LwDetrConfig, in_channels: int):
+ super().__init__()
+ num_blocks = config.c2f_num_blocks
+ activation = config.activation_function
+ out_channels = config.d_model
+
+ self.hidden_channels = int(out_channels * config.hidden_expansion)
+
+ conv1_out_channels = 2 * self.hidden_channels
+ self.conv1 = LwDetrConvNormLayer(config, in_channels, conv1_out_channels, 1, 1, activation=activation)
+
+ conv2_in_channels = (2 + num_blocks) * self.hidden_channels
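+ # NOTE: conv1 emits 2 * hidden_channels, which split() in forward() turns into two
+ # hidden_channels-wide branches; each of the num_blocks bottlenecks appends one more branch,
+ # hence (2 + num_blocks) * hidden_channels channels are concatenated before conv2.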
+ self.conv2 = LwDetrConvNormLayer(config, conv2_in_channels, out_channels, 1, 1, activation=activation)
+
+ self.bottlenecks = nn.ModuleList(LwDetrRepVggBlock(config) for _ in range(num_blocks))
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.conv1(hidden_states)
+ all_hidden_states = list(hidden_states.split(self.hidden_channels, 1))
+ hidden_states = all_hidden_states[-1]
+
+ for bottleneck in self.bottlenecks:
+ hidden_states = bottleneck(hidden_states)
+ all_hidden_states.append(hidden_states)
+
+ hidden_states = torch.cat(all_hidden_states, 1)
+ hidden_states = self.conv2(hidden_states)
+ return hidden_states
+
+
+ class LwDetrLayerNorm(ConvNextLayerNorm):
+ pass
+
+
+ class LwDetrSamplingLayer(nn.Module):
+ def __init__(self, config: LwDetrConfig, channel_size: int, scale: float):
+ super().__init__()
+
+ self.scale = scale
+ self.channel_size = channel_size
+
+ layers = []
+ if scale == 2.0:
+ if channel_size > 512:
+ layers.append(LwDetrConvNormLayer(config, channel_size, channel_size // 2, 1, 1, activation="relu"))
+ layers.append(nn.ConvTranspose2d(channel_size // 2, channel_size // 4, kernel_size=2, stride=2))
+ else:
+ layers.append(nn.ConvTranspose2d(channel_size, channel_size // 2, 2, 2))
+ elif scale == 0.5:
+ layers.append(LwDetrConvNormLayer(config, channel_size, channel_size, 3, 2, activation="relu"))
+ self.layers = nn.ModuleList(layers)
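+ # NOTE: scale == 2.0 upsamples with a stride-2 ConvTranspose2d (halving the channels, or
+ # quartering them after the extra 1x1 conv when channel_size > 512), scale == 0.5 downsamples
+ # with a stride-2 conv, and scale == 1.0 leaves the list empty so forward() is an identity.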
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ for layer in self.layers:
+ hidden_states = layer(hidden_states)
+ return hidden_states
+
+
+ class LwDetrScaleProjector(nn.Module):
+ def __init__(self, config: LwDetrConfig, scale: float):
+ super().__init__()
+
+ intermediate_dims = [config.backbone_config.hidden_size] * len(config.backbone_config.out_indices)
+ sampling_layers = []
+ for channel_size in intermediate_dims:
+ sampling_layers.append(LwDetrSamplingLayer(config, channel_size, scale))
+ self.sampling_layers = nn.ModuleList(sampling_layers)
+
+ intermediate_dim = intermediate_dims[-1]
+ if scale == 2.0:
+ if intermediate_dim > 512:
+ intermediate_dim = intermediate_dim // 4
+ else:
+ intermediate_dim = intermediate_dim // 2
+ projector_input_dim = intermediate_dim * len(intermediate_dims)
+
+ self.projector_layer = LwDetrC2FLayer(config, projector_input_dim)
+ self.layer_norm = LwDetrLayerNorm(config.d_model, data_format="channels_first")
+
+ def forward(self, hidden_states_tuple: tuple[torch.Tensor]) -> torch.Tensor:
+ sampled_hidden_states = []
+ for sampling_layer, hidden_states in zip(self.sampling_layers, hidden_states_tuple):
+ hidden_states = sampling_layer(hidden_states)
+ sampled_hidden_states.append(hidden_states)
+ hidden_states = torch.cat(sampled_hidden_states, dim=1)
+ hidden_states = self.projector_layer(hidden_states)
+ hidden_states = self.layer_norm(hidden_states)
+ return hidden_states
+
+
+ class LwDetrMultiScaleProjector(nn.Module):
+ def __init__(self, config: LwDetrConfig):
+ super().__init__()
+
+ self.config = config
+ scale_factors = config.projector_scale_factors
+
+ self.scale_layers = nn.ModuleList([LwDetrScaleProjector(config, scale) for scale in scale_factors])
+
+ def forward(self, hidden_states: tuple[torch.Tensor]) -> list[torch.Tensor]:
+ output_hidden_states = []
+ for scale_layer in self.scale_layers:
+ output_hidden_states.append(scale_layer(hidden_states))
+ return output_hidden_states
+
+
+ class LwDetrConvEncoder(nn.Module):
+ def __init__(self, config: LwDetrConfig):
+ super().__init__()
+ self.backbone = LwDetrViTBackbone(config.backbone_config)
+ self.projector = LwDetrMultiScaleProjector(config)
+
+ def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
+ # send pixel_values through the model to get list of feature maps
+ features = self.backbone(pixel_values).feature_maps
+ features = self.projector(features)
+ out = []
+ for feature_map in features:
+ # downsample pixel_mask to match shape of corresponding feature_map
+ mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
+ out.append((feature_map, mask))
+ return out
+
+
+ class LwDetrAttention(nn.Module):
+ def __init__(self, config: LwDetrConfig, layer_idx: int):
+ super().__init__()
+ self.config = config
+ self.layer_idx = layer_idx
+ self.head_dim = getattr(config, "head_dim", config.d_model // config.decoder_self_attention_heads)
+ self.scaling = self.head_dim**-0.5
+ self.attention_dropout = config.attention_dropout
+ self.is_causal = False
+ self.num_key_value_groups = 1
+
+ self.q_proj = nn.Linear(
+ config.d_model, config.decoder_self_attention_heads * self.head_dim, bias=config.attention_bias
+ )
+ self.k_proj = nn.Linear(
+ config.d_model, config.decoder_self_attention_heads * self.head_dim, bias=config.attention_bias
+ )
+ self.v_proj = nn.Linear(
+ config.d_model, config.decoder_self_attention_heads * self.head_dim, bias=config.attention_bias
+ )
+ self.o_proj = nn.Linear(
+ config.decoder_self_attention_heads * self.head_dim, config.d_model, bias=config.attention_bias
+ )
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_embeddings: torch.Tensor | None = None,
+ **kwargs: Unpack[TransformersKwargs],
+ ) -> tuple[torch.Tensor, torch.Tensor]:
+ batch_size, seq_len, _ = hidden_states.shape
+ input_shape = hidden_states.shape[:-1]
+ hidden_shape = (*input_shape, -1, self.head_dim)
+
+ hidden_states_original = hidden_states
+ if position_embeddings is not None:
+ hidden_states = hidden_states + position_embeddings
+
+ if self.training:
+ # during training, the group detr technique adds supervision by running multiple weight-sharing decoder groups at once for faster convergence
+ # at inference, only one group is used
+ hidden_states_original = torch.cat(
+ hidden_states_original.split(seq_len // self.config.group_detr, dim=1), dim=0
+ )
+ hidden_states = torch.cat(hidden_states.split(seq_len // self.config.group_detr, dim=1), dim=0)
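+ # NOTE: the query axis is split into group_detr chunks stacked onto the batch axis, so each
+ # group of queries only self-attends within its own group; the chunks are moved back onto the
+ # query axis after the output projection below.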
+
+ query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+ key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+ value_states = self.v_proj(hidden_states_original).view(hidden_shape).transpose(1, 2)
+
+ attention_interface: Callable = eager_attention_forward
+ if self.config._attn_implementation != "eager":
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+ attn_output, attn_weights = attention_interface(
+ self,
+ query_states,
+ key_states,
+ value_states,
+ attention_mask=None,
+ dropout=0.0 if not self.training else self.attention_dropout,
+ scaling=self.scaling,
+ **kwargs,
+ )
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+ attn_output = self.o_proj(attn_output)
+
+ if self.training:
+ attn_output = torch.cat(torch.split(attn_output, batch_size, dim=0), dim=1)
+
+ return attn_output, attn_weights
+
+
+ class LwDetrMultiscaleDeformableAttention(DeformableDetrMultiscaleDeformableAttention):
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor | None = None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ position_embeddings: torch.Tensor | None = None,
+ reference_points=None,
+ spatial_shapes=None,
+ spatial_shapes_list=None,
+ level_start_index=None,
+ **kwargs: Unpack[TransformersKwargs],
+ ):
+ return super().forward(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ position_embeddings=position_embeddings,
+ reference_points=reference_points,
+ spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
+ level_start_index=level_start_index,
+ **kwargs,
+ )
+
+
+ class LwDetrMLP(nn.Module):
+ def __init__(self, config: LwDetrConfig):
+ super().__init__()
+ self.dropout = config.dropout
+ self.activation_fn = ACT2FN[config.decoder_activation_function]
+ self.fc1 = nn.Linear(config.d_model, config.decoder_ffn_dim)
+ self.fc2 = nn.Linear(config.decoder_ffn_dim, config.d_model)
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ residual = hidden_states
+ hidden_states = self.fc1(hidden_states)
+ hidden_states = self.activation_fn(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = self.fc2(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ return hidden_states
+
+
+ class LwDetrDecoderLayer(GradientCheckpointingLayer):
+ def __init__(self, config: LwDetrConfig, layer_idx: int):
+ nn.Module.__init__(self)
+
+ # self-attention
+ self.self_attn = LwDetrAttention(config, layer_idx=layer_idx)
+ self.dropout = config.dropout
+ self.activation_fn = ACT2FN[config.decoder_activation_function]
+ self.activation_dropout = config.activation_dropout
+ self.self_attn_layer_norm = nn.LayerNorm(config.d_model)
+
+ # cross-attention
+ self.cross_attn = LwDetrMultiscaleDeformableAttention(
+ config,
+ num_heads=config.decoder_cross_attention_heads,
+ n_points=config.decoder_n_points,
+ )
+ self.cross_attn_layer_norm = nn.LayerNorm(config.d_model)
+
+ # mlp
+ self.mlp = LwDetrMLP(config)
+ self.layer_norm = nn.LayerNorm(config.d_model)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_embeddings: torch.Tensor | None = None,
+ reference_points=None,
+ spatial_shapes=None,
+ spatial_shapes_list=None,
+ level_start_index=None,
+ encoder_hidden_states: torch.Tensor | None = None,
+ encoder_attention_mask: torch.Tensor | None = None,
+ **kwargs: Unpack[TransformersKwargs],
+ ):
+ self_attention_output, self_attn_weights = self.self_attn(
+ hidden_states, position_embeddings=position_embeddings, **kwargs
+ )
+
+ self_attention_output = nn.functional.dropout(self_attention_output, p=self.dropout, training=self.training)
+ hidden_states = hidden_states + self_attention_output
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+
+ cross_attention_output, cross_attn_weights = self.cross_attn(
+ hidden_states=hidden_states,
+ attention_mask=encoder_attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ position_embeddings=position_embeddings,
+ reference_points=reference_points,
+ spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
+ level_start_index=level_start_index,
+ **kwargs,
+ )
+ cross_attention_output = nn.functional.dropout(cross_attention_output, p=self.dropout, training=self.training)
+ hidden_states = hidden_states + cross_attention_output
+ hidden_states = self.cross_attn_layer_norm(hidden_states)
+
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = self.layer_norm(hidden_states)
+
+ return hidden_states
+
+
+ @auto_docstring
+ class LwDetrPreTrainedModel(PreTrainedModel):
+ config: LwDetrConfig
+ base_model_prefix = "model"
+ main_input_name = "pixel_values"
+ _no_split_modules = [
+ r"LwDetrConvEncoder",
+ r"LwDetrDecoderLayer",
+ ]
+ _supports_sdpa = True
+ _supports_flash_attn = True
+ _supports_flex_attn = True
+ _supports_attention_backend = True
+ _can_record_outputs = {
+ "attentions": [LwDetrAttention, LwDetrMultiscaleDeformableAttention],
+ "hidden_states": [LwDetrDecoderLayer],
+ }
+
+ @torch.no_grad()
+ def _init_weights(self, module):
+ super()._init_weights(module)
+
+ if isinstance(module, LwDetrMultiscaleDeformableAttention):
+ init.constant_(module.sampling_offsets.weight, 0.0)
+ thetas = torch.arange(module.n_heads, dtype=torch.int64).float() * (2.0 * math.pi / module.n_heads)
+ grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+ grid_init = (
+ (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
+ .view(module.n_heads, 1, 1, 2)
+ .repeat(1, module.n_levels, module.n_points, 1)
+ )
+ for i in range(module.n_points):
+ grid_init[:, :, i, :] *= i + 1
+
+ init.copy_(module.sampling_offsets.bias, grid_init.view(-1))
+ init.constant_(module.attention_weights.weight, 0.0)
+ init.constant_(module.attention_weights.bias, 0.0)
+ init.xavier_uniform_(module.value_proj.weight)
+ init.constant_(module.value_proj.bias, 0.0)
+ init.xavier_uniform_(module.output_proj.weight)
+ init.constant_(module.output_proj.bias, 0.0)
+ if hasattr(module, "level_embed"):
+ init.normal_(module.level_embed)
+ if hasattr(module, "refpoint_embed") and module.refpoint_embed is not None:
+ init.constant_(module.refpoint_embed.weight, 0)
+ if hasattr(module, "class_embed") and module.class_embed is not None:
+ prior_prob = 0.01
+ bias_value = -math.log((1 - prior_prob) / prior_prob)
+ init.constant_(module.class_embed.bias, bias_value)
+ if hasattr(module, "bbox_embed") and module.bbox_embed is not None:
+ init.constant_(module.bbox_embed.layers[-1].weight, 0)
+ init.constant_(module.bbox_embed.layers[-1].bias, 0)
+
+
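+ # NOTE: boxes are (center_x, center_y, width, height). Worked example: reference (0.5, 0.5, 0.2, 0.2)
+ # with deltas (0.1, 0.0, 0.0, math.log(2)) yields center (0.5 + 0.1 * 0.2, 0.5) = (0.52, 0.5) and
+ # size (0.2 * exp(0.0), 0.2 * exp(log 2)) = (0.2, 0.4): center offsets are scaled by the reference
+ # box size, and sizes are scaled by exp(delta).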
+ def refine_bboxes(reference_points, deltas):
+ reference_points = reference_points.to(deltas.device)
+ new_reference_points_cxcy = deltas[..., :2] * reference_points[..., 2:] + reference_points[..., :2]
+ new_reference_points_wh = deltas[..., 2:].exp() * reference_points[..., 2:]
+ new_reference_points = torch.cat((new_reference_points_cxcy, new_reference_points_wh), -1)
+ return new_reference_points
+
+
+ @dataclass
+ @auto_docstring(
+ custom_intro="""
+ Base class for outputs of the LwDetrDecoder. This class adds two attributes to
+ BaseModelOutputWithCrossAttentions, namely:
+ - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
+ - a stacked tensor of intermediate reference points.
+ """
+ )
+ class LwDetrDecoderOutput(DeformableDetrDecoderOutput):
+ pass
+
+
+ class LwDetrDecoder(LwDetrPreTrainedModel):
+ """
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`LwDetrDecoderLayer`].
+
+ The decoder updates the query embeddings through multiple self-attention and deformable cross-attention layers.
+
+ Some tweaks for LwDetr:
+
+ - it uses the group detr technique during training for faster convergence.
+
+ Args:
+ config: LwDetrConfig
+ """
+
+ def __init__(self, config: LwDetrConfig):
+ super().__init__(config)
+ self.dropout = config.dropout
+ self.layers = nn.ModuleList([LwDetrDecoderLayer(config, i) for i in range(config.decoder_layers)])
+ self.layernorm = nn.LayerNorm(config.d_model)
+
+ self.gradient_checkpointing = False
+
+ self.ref_point_head = LwDetrMLPPredictionHead(2 * config.d_model, config.d_model, config.d_model, num_layers=2)
+
+ self.post_init()
+
+ def get_reference(self, reference_points, valid_ratios):
+ # batch_size, num_queries, 4
+ obj_center = reference_points[..., :4]
+
+ # batch_size, num_queries, num_levels, 4
+ reference_points_inputs = obj_center[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
+
+ # batch_size, num_queries, d_model * 2
+ query_sine_embed = gen_sine_position_embeddings(reference_points_inputs[:, :, 0, :], self.config.d_model)
+
+ # batch_size, num_queries, d_model
+ query_pos = self.ref_point_head(query_sine_embed)
+ return reference_points_inputs, query_pos
+
+ def forward(
+ self,
+ inputs_embeds: torch.Tensor | None = None,
+ reference_points: torch.Tensor | None = None,
+ spatial_shapes: torch.Tensor | None = None,
+ spatial_shapes_list: torch.Tensor | None = None,
+ level_start_index: torch.Tensor | None = None,
+ valid_ratios: torch.Tensor | None = None,
+ encoder_hidden_states: torch.Tensor | None = None,
+ encoder_attention_mask: torch.Tensor | None = None,
+ **kwargs: Unpack[TransformersKwargs],
+ ):
+ intermediate = ()
+ intermediate_reference_points = (reference_points,)
+
+ if inputs_embeds is not None:
+ hidden_states = inputs_embeds
+
+ reference_points_inputs, query_pos = self.get_reference(reference_points, valid_ratios)
+
+ for idx, decoder_layer in enumerate(self.layers):
+ hidden_states = decoder_layer(
+ hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ position_embeddings=query_pos,
+ reference_points=reference_points_inputs,
+ spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
+ level_start_index=level_start_index,
+ **kwargs,
+ )
+ intermediate_hidden_states = self.layernorm(hidden_states)
+ intermediate += (intermediate_hidden_states,)
+
+ intermediate = torch.stack(intermediate)
+ last_hidden_state = intermediate[-1]
+ intermediate_reference_points = torch.stack(intermediate_reference_points)
+
+ return LwDetrDecoderOutput(
+ last_hidden_state=last_hidden_state,
+ intermediate_hidden_states=intermediate,
+ intermediate_reference_points=intermediate_reference_points,
+ )
+
+
+ @dataclass
+ @auto_docstring(
+ custom_intro="""
+ Base class for outputs of the LwDetr backbone-decoder model.
+ """
+ )
+ class LwDetrModelOutput(ModelOutput):
+ r"""
+ init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+ Initial reference points sent through the Transformer decoder.
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+ Stacked intermediate hidden states (output of each layer of the decoder).
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+ Stacked intermediate reference points (reference points of each layer of the decoder).
+ enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*):
+ Classification scores of the first-stage proposals, from which the top `config.num_queries` scoring
+ bounding boxes per group are picked as region proposals.
+ enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
+ Logits of the predicted bounding box coordinates in the first stage.
+ """
+
+ init_reference_points: torch.FloatTensor | None = None
+ last_hidden_state: torch.FloatTensor | None = None
+ intermediate_hidden_states: torch.FloatTensor | None = None
+ intermediate_reference_points: torch.FloatTensor | None = None
+ enc_outputs_class: torch.FloatTensor | None = None
+ enc_outputs_coord_logits: torch.FloatTensor | None = None
+
+
+ @auto_docstring(
+ custom_intro="""
+ The bare LW Detr Model (consisting of a backbone and decoder Transformer) outputting raw
+ hidden-states without any specific head on top.
+ """
+ )
+ class LwDetrModel(DeformableDetrModel):
+ def __init__(self, config: LwDetrConfig):
+ LwDetrPreTrainedModel.__init__(self, config)
+
+ # Create backbone + positional encoding
+ self.backbone = LwDetrConvEncoder(config)
+
+ self.group_detr = config.group_detr
+ self.num_queries = config.num_queries
+ hidden_dim = config.d_model
+ self.reference_point_embed = nn.Embedding(self.num_queries * self.group_detr, 4)
+ self.query_feat = nn.Embedding(self.num_queries * self.group_detr, hidden_dim)
+
+ self.decoder = LwDetrDecoder(config)
+
+ self.enc_output = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(self.group_detr)])
+ self.enc_output_norm = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(self.group_detr)])
+ # Should normally be None and then instantiated in the ForObjectDetection class
+ self.enc_out_bbox_embed = nn.ModuleList(
+ [LwDetrMLPPredictionHead(config.d_model, config.d_model, 4, num_layers=3) for _ in range(self.group_detr)]
+ )
+ self.enc_out_class_embed = nn.ModuleList(
+ [nn.Linear(config.d_model, config.num_labels) for _ in range(self.group_detr)]
+ )
+
+ self.post_init()
+
+ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
+ """Generate the encoder output proposals from encoded enc_output.
+
+ Args:
+ enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder.
+ padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`.
+ spatial_shapes (list[tuple[int, int]]): Spatial shapes of the feature maps.
+
+ Returns:
+ `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
+ - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to
+ directly predict a bounding box (without the need for a decoder).
+ - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals (center_x, center_y,
+ width, height), with invalid positions masked with `inf`.
+ """
+ batch_size = enc_output.shape[0]
+ proposals = []
+ _cur = 0
+ for level, (height, width) in enumerate(spatial_shapes):
+ mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1)
+ valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
+ valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
+
+ grid_y, grid_x = meshgrid(
+ torch.linspace(
+ 0,
+ height - 1,
+ height,
+ dtype=enc_output.dtype,
+ device=enc_output.device,
+ ),
+ torch.linspace(
+ 0,
+ width - 1,
+ width,
+ dtype=enc_output.dtype,
+ device=enc_output.device,
+ ),
+ indexing="ij",
+ )
+ grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
+
+ scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2)
+ grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale
+ width_height = torch.ones_like(grid) * 0.05 * (2.0**level)
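+ # NOTE: each proposal starts as the (cx, cy) center of a grid cell (normalized by the valid,
+ # unpadded area) with a base width/height of 0.05 that doubles per feature level: 0.05, 0.1, 0.2, ...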
+ proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4)
+ proposals.append(proposal)
+ _cur += height * width
+ output_proposals = torch.cat(proposals, 1)
+ output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
+ output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf"))
+ output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf"))
+
+ # assign each pixel as an object query
+ object_query = enc_output
+ object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0))
+ object_query = object_query.masked_fill(~output_proposals_valid, float(0))
+ return object_query, output_proposals
+
+ @check_model_inputs
+ @auto_docstring
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor = None,
+ pixel_mask: torch.LongTensor | None = None,
+ **kwargs: Unpack[TransformersKwargs],
+ ) -> LwDetrModelOutput:
+ r"""
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, LwDetrModel
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("stevenbucaille/lwdetr_small_60e_coco")
+ >>> model = LwDetrModel.from_pretrained("stevenbucaille/lwdetr_small_60e_coco")
+
+ >>> inputs = image_processor(images=image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+
+ >>> last_hidden_states = outputs.last_hidden_state
+ >>> list(last_hidden_states.shape)
+ [1, 300, 256]
+ ```"""
+ batch_size, num_channels, height, width = pixel_values.shape
+ device = pixel_values.device
+
+ if pixel_mask is None:
+ pixel_mask = torch.ones((batch_size, height, width), dtype=torch.long, device=device)
+
+ # Extract multi-scale feature maps, all with `config.d_model` channels (cf Figure 4 in the paper)
+ # First, send pixel_values + pixel_mask through the backbone to obtain the features
+ # which is a list of tuples
+ features = self.backbone(pixel_values, pixel_mask)
+
+ # Then, collect the feature maps and downsampled masks (the projector already brought every level to d_model channels)
+ sources = []
+ masks = []
+ for level, (source, mask) in enumerate(features):
+ sources.append(source)
+ masks.append(mask)
+ if mask is None:
+ raise ValueError("No attention mask was provided")
+
+ if self.training:
+ reference_points = self.reference_point_embed.weight
+ query_feat = self.query_feat.weight
+ else:
+ # only use one group in inference
+ reference_points = self.reference_point_embed.weight[: self.num_queries]
+ query_feat = self.query_feat.weight[: self.num_queries]
+
+ # Prepare encoder inputs (by flattening)
+ source_flatten = []
+ mask_flatten = []
+ spatial_shapes_list = []
+ for source, mask in zip(sources, masks):
+ batch_size, num_channels, height, width = source.shape
+ spatial_shape = (height, width)
+ spatial_shapes_list.append(spatial_shape)
+ source = source.flatten(2).transpose(1, 2)
+ mask = mask.flatten(1)
+ source_flatten.append(source)
+ mask_flatten.append(mask)
+ source_flatten = torch.cat(source_flatten, 1)
+ mask_flatten = torch.cat(mask_flatten, 1)
+ spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device)
+ level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+ valid_ratios = torch.stack([self.get_valid_ratio(m, dtype=source_flatten.dtype) for m in masks], 1)
+
+ target = query_feat.unsqueeze(0).expand(batch_size, -1, -1)
+ reference_points = reference_points.unsqueeze(0).expand(batch_size, -1, -1)
+
+ object_query_embedding, output_proposals = self.gen_encoder_output_proposals(
+ source_flatten, ~mask_flatten, spatial_shapes_list
+ )
+
+ group_detr = self.group_detr if self.training else 1
+ topk = self.num_queries
+ topk_coords_logits = []
+ topk_coords_logits_undetach = []
+ object_query_undetach = []
+
+ for group_id in range(group_detr):
+ group_object_query = self.enc_output[group_id](object_query_embedding)
+ group_object_query = self.enc_output_norm[group_id](group_object_query)
+
+ group_enc_outputs_class = self.enc_out_class_embed[group_id](group_object_query)
+ group_delta_bbox = self.enc_out_bbox_embed[group_id](group_object_query)
+ group_enc_outputs_coord = refine_bboxes(output_proposals, group_delta_bbox)
+
+ group_topk_proposals = torch.topk(group_enc_outputs_class.max(-1)[0], topk, dim=1)[1]
+ group_topk_coords_logits_undetach = torch.gather(
+ group_enc_outputs_coord,
+ 1,
+ group_topk_proposals.unsqueeze(-1).repeat(1, 1, 4),
+ )
+ group_topk_coords_logits = group_topk_coords_logits_undetach.detach()
+ group_object_query_undetach = torch.gather(
+ group_object_query, 1, group_topk_proposals.unsqueeze(-1).repeat(1, 1, self.config.d_model)
+ )
+
+ topk_coords_logits.append(group_topk_coords_logits)
+ topk_coords_logits_undetach.append(group_topk_coords_logits_undetach)
+ object_query_undetach.append(group_object_query_undetach)
+
+ topk_coords_logits = torch.cat(topk_coords_logits, 1)
+ topk_coords_logits_undetach = torch.cat(topk_coords_logits_undetach, 1)
+ object_query_undetach = torch.cat(object_query_undetach, 1)
+
+ enc_outputs_class = object_query_undetach
+ enc_outputs_coord_logits = topk_coords_logits
+
+ reference_points = refine_bboxes(topk_coords_logits_undetach, reference_points)
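+ # NOTE: refine_bboxes treats the learned query reference boxes as deltas applied on top of the
+ # top-k encoder proposals; the refinement uses the undetached proposals, while the detached
+ # copies are what is returned as enc_outputs_coord_logits for the first-stage outputs.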
+
+ init_reference_points = reference_points
+ decoder_outputs = self.decoder(
+ inputs_embeds=target,
+ reference_points=reference_points,
+ spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
+ level_start_index=level_start_index,
+ valid_ratios=valid_ratios,
+ encoder_hidden_states=source_flatten,
+ encoder_attention_mask=mask_flatten,
+ **kwargs,
+ )
+
+ return LwDetrModelOutput(
+ init_reference_points=init_reference_points,
+ last_hidden_state=decoder_outputs.last_hidden_state,
+ intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+ intermediate_reference_points=decoder_outputs.intermediate_reference_points,
+ enc_outputs_class=enc_outputs_class,
+ enc_outputs_coord_logits=enc_outputs_coord_logits,
+ )
+
+
+ class LwDetrMLPPredictionHead(DeformableDetrMLPPredictionHead):
+ pass
+
+
+ @dataclass
+ @auto_docstring(
+ custom_intro="""
+ Output type of [`LwDetrForObjectDetection`].
+ """
+ )
+ class LwDetrObjectDetectionOutput(ModelOutput):
+ r"""
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
+ Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
+ bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+ scale-invariant IoU loss.
+ loss_dict (`Dict`, *optional*):
+ A dictionary containing the individual losses. Useful for logging.
+ logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+ Classification logits (including no-object) for all queries.
+ pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+ Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+ values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+ possible padding). You can use [`~DeformableDetrProcessor.post_process_object_detection`] to retrieve the
+ unnormalized bounding boxes.
+ auxiliary_outputs (`list[Dict]`, *optional*):
+ Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+ and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+ `pred_boxes`) for each decoder layer.
+ init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+ Initial reference points sent through the Transformer decoder.
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+ Stacked intermediate hidden states (output of each layer of the decoder).
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+ Stacked intermediate reference points (reference points of each layer of the decoder).
+ enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*):
+ Classification scores of the first-stage proposals, from which the top `config.num_queries` scoring
+ bounding boxes per group are picked as region proposals.
+ enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
+ Logits of the predicted bounding box coordinates in the first stage.
+ """
+
+ loss: torch.FloatTensor | None = None
+ loss_dict: dict | None = None
+ logits: torch.FloatTensor | None = None
+ pred_boxes: torch.FloatTensor | None = None
+ auxiliary_outputs: list[dict] | None = None
+ init_reference_points: torch.FloatTensor | None = None
+ last_hidden_state: torch.FloatTensor | None = None
+ intermediate_hidden_states: torch.FloatTensor | None = None
+ intermediate_reference_points: torch.FloatTensor | None = None
+ enc_outputs_class: Any = None
+ enc_outputs_coord_logits: torch.FloatTensor | None = None
+
+
+ @auto_docstring(
+ custom_intro="""
+ LW DETR Model (consisting of a backbone and decoder Transformer) with object detection heads on
+ top, for tasks such as COCO detection.
+ """
+ )
+ class LwDetrForObjectDetection(DeformableDetrForObjectDetection):
+ _tied_weights_keys = None
+
+ def __init__(self, config: LwDetrConfig):
+ PreTrainedModel.__init__(self, config)
+ self.model = LwDetrModel(config)
+ self.class_embed = nn.Linear(config.d_model, config.num_labels)
+ self.bbox_embed = LwDetrMLPPredictionHead(config.d_model, config.d_model, 4, num_layers=3)
+
+ self.post_init()
+
+ @check_model_inputs
+ @auto_docstring
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor = None,
+ pixel_mask: torch.LongTensor | None = None,
+ labels: list[dict] | None = None,
+ **kwargs: Unpack[TransformersKwargs],
+ ) -> LwDetrObjectDetectionOutput:
+ r"""
+ labels (`list[Dict]` of len `(batch_size,)`, *optional*):
+ Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+ following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+ respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+ in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, LwDetrForObjectDetection
+ >>> from PIL import Image
+ >>> import requests
+ >>> import torch
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("stevenbucaille/lwdetr_small_60e_coco")
+ >>> model = LwDetrForObjectDetection.from_pretrained("stevenbucaille/lwdetr_small_60e_coco")
+
+ >>> inputs = image_processor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+
+ >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+ >>> target_sizes = torch.tensor([image.size[::-1]])
+ >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[
+ ...     0
+ ... ]
+ >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+ ...     box = [round(i, 2) for i in box.tolist()]
+ ...     print(
+ ...         f"Detected {model.config.id2label[label.item()]} with confidence "
+ ...         f"{round(score.item(), 3)} at location {box}"
+ ...     )
+ Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78]
+ Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25]
+ Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25]
+ ```"""
+ outputs = self.model(
+ pixel_values,
+ pixel_mask=pixel_mask,
+ **kwargs,
+ )
+
+ last_hidden_states = outputs.last_hidden_state
+ intermediate_reference_points = outputs.intermediate_reference_points
+ enc_outputs_class_logits = outputs.enc_outputs_class
+ enc_outputs_boxes_logits = outputs.enc_outputs_coord_logits
+
+ logits = self.class_embed(last_hidden_states)
+ pred_boxes_delta = self.bbox_embed(last_hidden_states)
+ pred_boxes = refine_bboxes(intermediate_reference_points[-1], pred_boxes_delta)
+
+ enc_outputs_class_logits_list = enc_outputs_class_logits.split(self.config.num_queries, dim=1)
+ pred_class = []
+ group_detr = self.config.group_detr if self.training else 1
+ for group_index in range(group_detr):
+ group_pred_class = self.model.enc_out_class_embed[group_index](enc_outputs_class_logits_list[group_index])
+ pred_class.append(group_pred_class)
+ enc_outputs_class_logits = torch.cat(pred_class, dim=1)
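+ # NOTE: outputs.enc_outputs_class holds the top-k object query embeddings rather than logits;
+ # they are mapped to class logits here with each group's own head (a single group at inference).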
+
+ loss, loss_dict, auxiliary_outputs = None, None, None
+ if labels is not None:
+ outputs_class, outputs_coord = None, None
+ if self.config.auxiliary_loss:
+ intermediate_hidden_states = outputs.intermediate_hidden_states
+ outputs_coord_delta = self.bbox_embed(intermediate_hidden_states)
+ outputs_coord = refine_bboxes(intermediate_reference_points, outputs_coord_delta)
+ outputs_class = self.class_embed(intermediate_hidden_states)
+
+ loss, loss_dict, auxiliary_outputs = self.loss_function(
+ logits,
+ labels,
+ self.device,
+ pred_boxes,
+ self.config,
+ outputs_class,
+ outputs_coord,
+ enc_outputs_class_logits,
+ enc_outputs_boxes_logits,
+ )
+
+ return LwDetrObjectDetectionOutput(
+ loss=loss,
+ loss_dict=loss_dict,
+ logits=logits,
+ pred_boxes=pred_boxes,
+ auxiliary_outputs=auxiliary_outputs,
+ last_hidden_state=outputs.last_hidden_state,
+ intermediate_hidden_states=outputs.intermediate_hidden_states,
+ intermediate_reference_points=outputs.intermediate_reference_points,
+ init_reference_points=outputs.init_reference_points,
+ enc_outputs_class=enc_outputs_class_logits,
+ enc_outputs_coord_logits=enc_outputs_boxes_logits,
+ )
+
+
+ __all__ = [
+ "LwDetrConfig",
+ "LwDetrPreTrainedModel",
+ "LwDetrModel",
+ "LwDetrForObjectDetection",
+ "LwDetrViTConfig",
+ "LwDetrViTPreTrainedModel",
+ "LwDetrViTBackbone",
+ ]