transformers-5.0.0-py3-none-any.whl → transformers-5.0.0rc0-py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
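For context, a diff like this can be reproduced locally: a wheel is a plain zip archive, so both versions can be downloaded (for example with `pip download transformers==5.0.0 --no-deps`) and their extracted trees compared. A minimal Python sketch follows; the local wheel filenames in the usage comment are assumptions for illustration, not outputs of the diff service itself.

```python
# Minimal sketch: reproduce per-file +/- counts between two wheels.
# Assumes both .whl files have already been downloaded locally.
import difflib
import zipfile
from pathlib import Path

def extract(wheel: str, dest: str) -> Path:
    """Unpack a wheel (a zip archive) into dest and return the path."""
    out = Path(dest)
    with zipfile.ZipFile(wheel) as zf:
        zf.extractall(out)
    return out

def diff_stats(old_root: Path, new_root: Path) -> dict[str, tuple[int, int]]:
    """Return {relative_path: (added, removed)} for every .py file in the
    new tree (files deleted in the new version are not counted here)."""
    stats = {}
    for new_file in new_root.rglob("*.py"):
        rel = new_file.relative_to(new_root)
        old_file = old_root / rel
        old_lines = old_file.read_text().splitlines() if old_file.exists() else []
        new_lines = new_file.read_text().splitlines()
        added = removed = 0
        for line in difflib.unified_diff(old_lines, new_lines, lineterm=""):
            if line.startswith("+") and not line.startswith("+++"):
                added += 1
            elif line.startswith("-") and not line.startswith("---"):
                removed += 1
        if added or removed:
            stats[str(rel)] = (added, removed)
    return stats

# Usage (assumed local filenames):
# old = extract("transformers-5.0.0-py3-none-any.whl", "old")
# new = extract("transformers-5.0.0rc0-py3-none-any.whl", "new")
# for path, (a, r) in sorted(diff_stats(old, new).items()):
#     print(f"{path} +{a} -{r}")
```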
Files changed (1606)
  1. transformers/__init__.py +36 -55
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +33 -32
  4. transformers/cache_utils.py +139 -32
  5. transformers/cli/chat.py +3 -3
  6. transformers/cli/serve.py +19 -49
  7. transformers/cli/transformers.py +1 -2
  8. transformers/configuration_utils.py +155 -129
  9. transformers/conversion_mapping.py +22 -158
  10. transformers/convert_slow_tokenizer.py +17 -227
  11. transformers/core_model_loading.py +185 -528
  12. transformers/data/data_collator.py +4 -12
  13. transformers/data/processors/glue.py +1 -0
  14. transformers/data/processors/utils.py +1 -0
  15. transformers/data/processors/xnli.py +1 -0
  16. transformers/dependency_versions_check.py +1 -0
  17. transformers/dependency_versions_table.py +7 -5
  18. transformers/distributed/configuration_utils.py +2 -1
  19. transformers/dynamic_module_utils.py +25 -24
  20. transformers/feature_extraction_sequence_utils.py +23 -19
  21. transformers/feature_extraction_utils.py +33 -64
  22. transformers/file_utils.py +1 -0
  23. transformers/generation/__init__.py +1 -11
  24. transformers/generation/candidate_generator.py +33 -80
  25. transformers/generation/configuration_utils.py +133 -189
  26. transformers/generation/continuous_batching/__init__.py +1 -4
  27. transformers/generation/continuous_batching/cache.py +25 -83
  28. transformers/generation/continuous_batching/cache_manager.py +45 -155
  29. transformers/generation/continuous_batching/continuous_api.py +147 -270
  30. transformers/generation/continuous_batching/requests.py +3 -51
  31. transformers/generation/continuous_batching/scheduler.py +105 -160
  32. transformers/generation/logits_process.py +128 -0
  33. transformers/generation/stopping_criteria.py +1 -1
  34. transformers/generation/streamers.py +1 -0
  35. transformers/generation/utils.py +123 -122
  36. transformers/generation/watermarking.py +6 -8
  37. transformers/hf_argparser.py +13 -9
  38. transformers/hyperparameter_search.py +2 -1
  39. transformers/image_processing_base.py +23 -12
  40. transformers/image_processing_utils.py +15 -11
  41. transformers/image_processing_utils_fast.py +75 -85
  42. transformers/image_transforms.py +42 -73
  43. transformers/image_utils.py +32 -30
  44. transformers/initialization.py +0 -37
  45. transformers/integrations/__init__.py +2 -16
  46. transformers/integrations/accelerate.py +113 -58
  47. transformers/integrations/aqlm.py +66 -36
  48. transformers/integrations/awq.py +516 -45
  49. transformers/integrations/bitnet.py +105 -47
  50. transformers/integrations/bitsandbytes.py +202 -91
  51. transformers/integrations/deepspeed.py +4 -161
  52. transformers/integrations/eetq.py +82 -84
  53. transformers/integrations/executorch.py +1 -1
  54. transformers/integrations/fbgemm_fp8.py +145 -190
  55. transformers/integrations/finegrained_fp8.py +215 -249
  56. transformers/integrations/flash_attention.py +3 -3
  57. transformers/integrations/flex_attention.py +1 -1
  58. transformers/integrations/fp_quant.py +0 -90
  59. transformers/integrations/ggml.py +2 -11
  60. transformers/integrations/higgs.py +62 -37
  61. transformers/integrations/hub_kernels.py +8 -65
  62. transformers/integrations/integration_utils.py +3 -47
  63. transformers/integrations/mistral.py +0 -12
  64. transformers/integrations/mxfp4.py +80 -33
  65. transformers/integrations/peft.py +191 -483
  66. transformers/integrations/quanto.py +56 -77
  67. transformers/integrations/spqr.py +90 -42
  68. transformers/integrations/tensor_parallel.py +221 -167
  69. transformers/integrations/torchao.py +43 -35
  70. transformers/integrations/vptq.py +59 -40
  71. transformers/kernels/__init__.py +0 -0
  72. transformers/{models/pe_audio_video/processing_pe_audio_video.py → kernels/falcon_mamba/__init__.py} +3 -12
  73. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +529 -0
  74. transformers/loss/loss_utils.py +0 -2
  75. transformers/masking_utils.py +55 -51
  76. transformers/model_debugging_utils.py +5 -4
  77. transformers/modelcard.py +194 -15
  78. transformers/modeling_attn_mask_utils.py +19 -19
  79. transformers/modeling_flash_attention_utils.py +27 -27
  80. transformers/modeling_gguf_pytorch_utils.py +24 -79
  81. transformers/modeling_layers.py +22 -21
  82. transformers/modeling_outputs.py +253 -242
  83. transformers/modeling_rope_utils.py +117 -138
  84. transformers/modeling_utils.py +739 -850
  85. transformers/models/__init__.py +0 -27
  86. transformers/models/afmoe/configuration_afmoe.py +33 -40
  87. transformers/models/afmoe/modeling_afmoe.py +54 -42
  88. transformers/models/afmoe/modular_afmoe.py +33 -23
  89. transformers/models/aimv2/configuration_aimv2.py +10 -2
  90. transformers/models/aimv2/modeling_aimv2.py +42 -47
  91. transformers/models/aimv2/modular_aimv2.py +19 -17
  92. transformers/models/albert/configuration_albert.py +2 -8
  93. transformers/models/albert/modeling_albert.py +69 -70
  94. transformers/models/albert/tokenization_albert.py +14 -5
  95. transformers/models/align/configuration_align.py +6 -8
  96. transformers/models/align/modeling_align.py +89 -94
  97. transformers/models/align/processing_align.py +30 -2
  98. transformers/models/altclip/configuration_altclip.py +7 -4
  99. transformers/models/altclip/modeling_altclip.py +103 -114
  100. transformers/models/altclip/processing_altclip.py +15 -2
  101. transformers/models/apertus/__init__.py +1 -0
  102. transformers/models/apertus/configuration_apertus.py +28 -23
  103. transformers/models/apertus/modeling_apertus.py +40 -39
  104. transformers/models/apertus/modular_apertus.py +38 -37
  105. transformers/models/arcee/configuration_arcee.py +30 -25
  106. transformers/models/arcee/modeling_arcee.py +39 -36
  107. transformers/models/arcee/modular_arcee.py +23 -20
  108. transformers/models/aria/configuration_aria.py +44 -31
  109. transformers/models/aria/image_processing_aria.py +27 -25
  110. transformers/models/aria/modeling_aria.py +106 -110
  111. transformers/models/aria/modular_aria.py +127 -118
  112. transformers/models/aria/processing_aria.py +35 -28
  113. transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +1 -0
  114. transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +6 -3
  115. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +8 -6
  116. transformers/models/audioflamingo3/__init__.py +1 -0
  117. transformers/models/audioflamingo3/configuration_audioflamingo3.py +1 -0
  118. transformers/models/audioflamingo3/modeling_audioflamingo3.py +49 -58
  119. transformers/models/audioflamingo3/modular_audioflamingo3.py +43 -53
  120. transformers/models/audioflamingo3/processing_audioflamingo3.py +30 -33
  121. transformers/models/auto/auto_factory.py +7 -6
  122. transformers/models/auto/configuration_auto.py +5 -66
  123. transformers/models/auto/feature_extraction_auto.py +10 -14
  124. transformers/models/auto/image_processing_auto.py +41 -32
  125. transformers/models/auto/modeling_auto.py +188 -46
  126. transformers/models/auto/processing_auto.py +11 -24
  127. transformers/models/auto/tokenization_auto.py +588 -171
  128. transformers/models/auto/video_processing_auto.py +10 -12
  129. transformers/models/autoformer/configuration_autoformer.py +7 -4
  130. transformers/models/autoformer/modeling_autoformer.py +101 -104
  131. transformers/models/aya_vision/configuration_aya_vision.py +1 -4
  132. transformers/models/aya_vision/modeling_aya_vision.py +102 -71
  133. transformers/models/aya_vision/modular_aya_vision.py +74 -46
  134. transformers/models/aya_vision/processing_aya_vision.py +53 -25
  135. transformers/models/bamba/configuration_bamba.py +39 -34
  136. transformers/models/bamba/modeling_bamba.py +86 -82
  137. transformers/models/bamba/modular_bamba.py +72 -70
  138. transformers/models/bark/configuration_bark.py +8 -6
  139. transformers/models/bark/generation_configuration_bark.py +5 -3
  140. transformers/models/bark/modeling_bark.py +57 -54
  141. transformers/models/bark/processing_bark.py +41 -19
  142. transformers/models/bart/configuration_bart.py +6 -9
  143. transformers/models/bart/modeling_bart.py +126 -135
  144. transformers/models/barthez/tokenization_barthez.py +11 -3
  145. transformers/models/bartpho/tokenization_bartpho.py +7 -6
  146. transformers/models/beit/configuration_beit.py +11 -0
  147. transformers/models/beit/image_processing_beit.py +56 -53
  148. transformers/models/beit/image_processing_beit_fast.py +12 -10
  149. transformers/models/beit/modeling_beit.py +60 -69
  150. transformers/models/bert/configuration_bert.py +2 -12
  151. transformers/models/bert/modeling_bert.py +122 -114
  152. transformers/models/bert/tokenization_bert.py +23 -8
  153. transformers/models/bert/tokenization_bert_legacy.py +5 -3
  154. transformers/models/bert_generation/configuration_bert_generation.py +2 -17
  155. transformers/models/bert_generation/modeling_bert_generation.py +49 -49
  156. transformers/models/bert_generation/tokenization_bert_generation.py +3 -2
  157. transformers/models/bert_japanese/tokenization_bert_japanese.py +6 -5
  158. transformers/models/bertweet/tokenization_bertweet.py +3 -1
  159. transformers/models/big_bird/configuration_big_bird.py +9 -12
  160. transformers/models/big_bird/modeling_big_bird.py +109 -116
  161. transformers/models/big_bird/tokenization_big_bird.py +43 -16
  162. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -9
  163. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +117 -130
  164. transformers/models/biogpt/configuration_biogpt.py +2 -8
  165. transformers/models/biogpt/modeling_biogpt.py +76 -72
  166. transformers/models/biogpt/modular_biogpt.py +66 -62
  167. transformers/models/biogpt/tokenization_biogpt.py +5 -3
  168. transformers/models/bit/configuration_bit.py +1 -0
  169. transformers/models/bit/image_processing_bit.py +24 -21
  170. transformers/models/bit/image_processing_bit_fast.py +1 -0
  171. transformers/models/bit/modeling_bit.py +12 -25
  172. transformers/models/bitnet/configuration_bitnet.py +28 -23
  173. transformers/models/bitnet/modeling_bitnet.py +39 -36
  174. transformers/models/bitnet/modular_bitnet.py +6 -4
  175. transformers/models/blenderbot/configuration_blenderbot.py +5 -8
  176. transformers/models/blenderbot/modeling_blenderbot.py +96 -77
  177. transformers/models/blenderbot/tokenization_blenderbot.py +24 -18
  178. transformers/models/blenderbot_small/configuration_blenderbot_small.py +5 -8
  179. transformers/models/blenderbot_small/modeling_blenderbot_small.py +69 -79
  180. transformers/models/blenderbot_small/tokenization_blenderbot_small.py +3 -1
  181. transformers/models/blip/configuration_blip.py +10 -9
  182. transformers/models/blip/image_processing_blip.py +20 -17
  183. transformers/models/blip/image_processing_blip_fast.py +1 -0
  184. transformers/models/blip/modeling_blip.py +108 -117
  185. transformers/models/blip/modeling_blip_text.py +65 -73
  186. transformers/models/blip/processing_blip.py +36 -5
  187. transformers/models/blip_2/configuration_blip_2.py +2 -2
  188. transformers/models/blip_2/modeling_blip_2.py +118 -146
  189. transformers/models/blip_2/processing_blip_2.py +38 -8
  190. transformers/models/bloom/configuration_bloom.py +2 -5
  191. transformers/models/bloom/modeling_bloom.py +104 -77
  192. transformers/models/blt/configuration_blt.py +86 -94
  193. transformers/models/blt/modeling_blt.py +81 -238
  194. transformers/models/blt/modular_blt.py +65 -228
  195. transformers/models/bridgetower/configuration_bridgetower.py +2 -7
  196. transformers/models/bridgetower/image_processing_bridgetower.py +35 -34
  197. transformers/models/bridgetower/image_processing_bridgetower_fast.py +16 -13
  198. transformers/models/bridgetower/modeling_bridgetower.py +119 -141
  199. transformers/models/bridgetower/processing_bridgetower.py +16 -2
  200. transformers/models/bros/configuration_bros.py +18 -24
  201. transformers/models/bros/modeling_bros.py +80 -90
  202. transformers/models/bros/processing_bros.py +12 -2
  203. transformers/models/byt5/tokenization_byt5.py +6 -4
  204. transformers/models/camembert/configuration_camembert.py +2 -8
  205. transformers/models/camembert/modeling_camembert.py +195 -196
  206. transformers/models/camembert/modular_camembert.py +54 -51
  207. transformers/models/camembert/tokenization_camembert.py +13 -6
  208. transformers/models/canine/configuration_canine.py +2 -4
  209. transformers/models/canine/modeling_canine.py +75 -84
  210. transformers/models/canine/tokenization_canine.py +1 -2
  211. transformers/models/chameleon/configuration_chameleon.py +34 -29
  212. transformers/models/chameleon/image_processing_chameleon.py +24 -21
  213. transformers/models/chameleon/image_processing_chameleon_fast.py +6 -5
  214. transformers/models/chameleon/modeling_chameleon.py +93 -142
  215. transformers/models/chameleon/processing_chameleon.py +41 -16
  216. transformers/models/chinese_clip/configuration_chinese_clip.py +8 -10
  217. transformers/models/chinese_clip/image_processing_chinese_clip.py +24 -21
  218. transformers/models/chinese_clip/image_processing_chinese_clip_fast.py +1 -0
  219. transformers/models/chinese_clip/modeling_chinese_clip.py +92 -96
  220. transformers/models/chinese_clip/processing_chinese_clip.py +15 -2
  221. transformers/models/clap/configuration_clap.py +9 -4
  222. transformers/models/clap/feature_extraction_clap.py +12 -11
  223. transformers/models/clap/modeling_clap.py +123 -136
  224. transformers/models/clap/processing_clap.py +15 -2
  225. transformers/models/clip/configuration_clip.py +2 -4
  226. transformers/models/clip/image_processing_clip.py +24 -21
  227. transformers/models/clip/image_processing_clip_fast.py +1 -9
  228. transformers/models/clip/modeling_clip.py +65 -65
  229. transformers/models/clip/processing_clip.py +14 -2
  230. transformers/models/clip/tokenization_clip.py +46 -21
  231. transformers/models/clipseg/configuration_clipseg.py +2 -4
  232. transformers/models/clipseg/modeling_clipseg.py +109 -119
  233. transformers/models/clipseg/processing_clipseg.py +42 -19
  234. transformers/models/clvp/configuration_clvp.py +5 -15
  235. transformers/models/clvp/feature_extraction_clvp.py +10 -7
  236. transformers/models/clvp/modeling_clvp.py +146 -155
  237. transformers/models/clvp/number_normalizer.py +2 -1
  238. transformers/models/clvp/processing_clvp.py +20 -3
  239. transformers/models/clvp/tokenization_clvp.py +64 -1
  240. transformers/models/code_llama/tokenization_code_llama.py +44 -18
  241. transformers/models/codegen/configuration_codegen.py +4 -4
  242. transformers/models/codegen/modeling_codegen.py +53 -63
  243. transformers/models/codegen/tokenization_codegen.py +47 -17
  244. transformers/models/cohere/configuration_cohere.py +30 -25
  245. transformers/models/cohere/modeling_cohere.py +42 -40
  246. transformers/models/cohere/modular_cohere.py +29 -26
  247. transformers/models/cohere/tokenization_cohere.py +46 -15
  248. transformers/models/cohere2/configuration_cohere2.py +32 -31
  249. transformers/models/cohere2/modeling_cohere2.py +44 -42
  250. transformers/models/cohere2/modular_cohere2.py +54 -54
  251. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +14 -13
  252. transformers/models/cohere2_vision/modeling_cohere2_vision.py +58 -59
  253. transformers/models/cohere2_vision/modular_cohere2_vision.py +46 -45
  254. transformers/models/cohere2_vision/processing_cohere2_vision.py +36 -6
  255. transformers/models/colpali/configuration_colpali.py +1 -0
  256. transformers/models/colpali/modeling_colpali.py +16 -14
  257. transformers/models/colpali/modular_colpali.py +51 -11
  258. transformers/models/colpali/processing_colpali.py +52 -14
  259. transformers/models/colqwen2/modeling_colqwen2.py +28 -28
  260. transformers/models/colqwen2/modular_colqwen2.py +74 -37
  261. transformers/models/colqwen2/processing_colqwen2.py +52 -16
  262. transformers/models/conditional_detr/configuration_conditional_detr.py +2 -1
  263. transformers/models/conditional_detr/image_processing_conditional_detr.py +70 -67
  264. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +36 -36
  265. transformers/models/conditional_detr/modeling_conditional_detr.py +87 -99
  266. transformers/models/conditional_detr/modular_conditional_detr.py +3 -49
  267. transformers/models/convbert/configuration_convbert.py +8 -11
  268. transformers/models/convbert/modeling_convbert.py +87 -94
  269. transformers/models/convbert/tokenization_convbert.py +1 -0
  270. transformers/models/convnext/configuration_convnext.py +1 -0
  271. transformers/models/convnext/image_processing_convnext.py +23 -20
  272. transformers/models/convnext/image_processing_convnext_fast.py +21 -16
  273. transformers/models/convnext/modeling_convnext.py +12 -9
  274. transformers/models/convnextv2/configuration_convnextv2.py +1 -0
  275. transformers/models/convnextv2/modeling_convnextv2.py +12 -9
  276. transformers/models/cpm/tokenization_cpm.py +7 -6
  277. transformers/models/cpm/tokenization_cpm_fast.py +5 -3
  278. transformers/models/cpmant/configuration_cpmant.py +1 -4
  279. transformers/models/cpmant/modeling_cpmant.py +40 -38
  280. transformers/models/cpmant/tokenization_cpmant.py +3 -1
  281. transformers/models/csm/configuration_csm.py +66 -58
  282. transformers/models/csm/generation_csm.py +35 -31
  283. transformers/models/csm/modeling_csm.py +85 -85
  284. transformers/models/csm/modular_csm.py +58 -58
  285. transformers/models/csm/processing_csm.py +68 -25
  286. transformers/models/ctrl/configuration_ctrl.py +1 -16
  287. transformers/models/ctrl/modeling_ctrl.py +44 -54
  288. transformers/models/ctrl/tokenization_ctrl.py +1 -0
  289. transformers/models/cvt/configuration_cvt.py +1 -0
  290. transformers/models/cvt/modeling_cvt.py +16 -20
  291. transformers/models/cwm/__init__.py +1 -0
  292. transformers/models/cwm/configuration_cwm.py +12 -8
  293. transformers/models/cwm/modeling_cwm.py +39 -37
  294. transformers/models/cwm/modular_cwm.py +12 -10
  295. transformers/models/d_fine/configuration_d_fine.py +5 -7
  296. transformers/models/d_fine/modeling_d_fine.py +128 -138
  297. transformers/models/d_fine/modular_d_fine.py +18 -33
  298. transformers/models/dab_detr/configuration_dab_detr.py +3 -6
  299. transformers/models/dab_detr/modeling_dab_detr.py +75 -81
  300. transformers/models/dac/configuration_dac.py +1 -0
  301. transformers/models/dac/feature_extraction_dac.py +9 -6
  302. transformers/models/dac/modeling_dac.py +26 -24
  303. transformers/models/data2vec/configuration_data2vec_audio.py +2 -4
  304. transformers/models/data2vec/configuration_data2vec_text.py +3 -11
  305. transformers/models/data2vec/configuration_data2vec_vision.py +1 -0
  306. transformers/models/data2vec/modeling_data2vec_audio.py +56 -57
  307. transformers/models/data2vec/modeling_data2vec_text.py +93 -98
  308. transformers/models/data2vec/modeling_data2vec_vision.py +45 -49
  309. transformers/models/data2vec/modular_data2vec_audio.py +1 -6
  310. transformers/models/data2vec/modular_data2vec_text.py +54 -58
  311. transformers/models/dbrx/configuration_dbrx.py +22 -36
  312. transformers/models/dbrx/modeling_dbrx.py +45 -42
  313. transformers/models/dbrx/modular_dbrx.py +33 -31
  314. transformers/models/deberta/configuration_deberta.py +1 -6
  315. transformers/models/deberta/modeling_deberta.py +60 -64
  316. transformers/models/deberta/tokenization_deberta.py +21 -9
  317. transformers/models/deberta_v2/configuration_deberta_v2.py +1 -6
  318. transformers/models/deberta_v2/modeling_deberta_v2.py +65 -71
  319. transformers/models/deberta_v2/tokenization_deberta_v2.py +29 -11
  320. transformers/models/decision_transformer/configuration_decision_transformer.py +2 -3
  321. transformers/models/decision_transformer/modeling_decision_transformer.py +56 -60
  322. transformers/models/deepseek_v2/configuration_deepseek_v2.py +44 -39
  323. transformers/models/deepseek_v2/modeling_deepseek_v2.py +43 -43
  324. transformers/models/deepseek_v2/modular_deepseek_v2.py +49 -48
  325. transformers/models/deepseek_v3/configuration_deepseek_v3.py +45 -40
  326. transformers/models/deepseek_v3/modeling_deepseek_v3.py +42 -45
  327. transformers/models/deepseek_v3/modular_deepseek_v3.py +9 -14
  328. transformers/models/deepseek_vl/configuration_deepseek_vl.py +3 -2
  329. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +26 -25
  330. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +10 -10
  331. transformers/models/deepseek_vl/modeling_deepseek_vl.py +48 -57
  332. transformers/models/deepseek_vl/modular_deepseek_vl.py +43 -14
  333. transformers/models/deepseek_vl/processing_deepseek_vl.py +41 -10
  334. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +5 -3
  335. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +35 -35
  336. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +24 -20
  337. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +61 -109
  338. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +118 -146
  339. transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +44 -12
  340. transformers/models/deformable_detr/configuration_deformable_detr.py +3 -2
  341. transformers/models/deformable_detr/image_processing_deformable_detr.py +61 -59
  342. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +28 -28
  343. transformers/models/deformable_detr/modeling_deformable_detr.py +82 -88
  344. transformers/models/deformable_detr/modular_deformable_detr.py +3 -1
  345. transformers/models/deit/configuration_deit.py +1 -0
  346. transformers/models/deit/image_processing_deit.py +21 -18
  347. transformers/models/deit/image_processing_deit_fast.py +1 -0
  348. transformers/models/deit/modeling_deit.py +22 -24
  349. transformers/models/depth_anything/configuration_depth_anything.py +4 -2
  350. transformers/models/depth_anything/modeling_depth_anything.py +10 -10
  351. transformers/models/depth_pro/configuration_depth_pro.py +1 -0
  352. transformers/models/depth_pro/image_processing_depth_pro.py +23 -22
  353. transformers/models/depth_pro/image_processing_depth_pro_fast.py +10 -8
  354. transformers/models/depth_pro/modeling_depth_pro.py +27 -31
  355. transformers/models/detr/configuration_detr.py +2 -1
  356. transformers/models/detr/image_processing_detr.py +66 -64
  357. transformers/models/detr/image_processing_detr_fast.py +34 -33
  358. transformers/models/detr/modeling_detr.py +79 -95
  359. transformers/models/dia/configuration_dia.py +15 -9
  360. transformers/models/dia/feature_extraction_dia.py +9 -6
  361. transformers/models/dia/generation_dia.py +50 -48
  362. transformers/models/dia/modeling_dia.py +69 -78
  363. transformers/models/dia/modular_dia.py +56 -64
  364. transformers/models/dia/processing_dia.py +29 -39
  365. transformers/models/dia/tokenization_dia.py +6 -3
  366. transformers/models/diffllama/configuration_diffllama.py +30 -25
  367. transformers/models/diffllama/modeling_diffllama.py +49 -46
  368. transformers/models/diffllama/modular_diffllama.py +19 -17
  369. transformers/models/dinat/configuration_dinat.py +1 -0
  370. transformers/models/dinat/modeling_dinat.py +44 -47
  371. transformers/models/dinov2/configuration_dinov2.py +1 -0
  372. transformers/models/dinov2/modeling_dinov2.py +15 -15
  373. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +1 -1
  374. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +15 -16
  375. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +9 -9
  376. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +7 -4
  377. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +6 -3
  378. transformers/models/dinov3_vit/configuration_dinov3_vit.py +8 -5
  379. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +9 -7
  380. transformers/models/dinov3_vit/modeling_dinov3_vit.py +18 -19
  381. transformers/models/dinov3_vit/modular_dinov3_vit.py +15 -16
  382. transformers/models/distilbert/configuration_distilbert.py +2 -8
  383. transformers/models/distilbert/modeling_distilbert.py +55 -55
  384. transformers/models/distilbert/tokenization_distilbert.py +1 -13
  385. transformers/models/doge/__init__.py +1 -0
  386. transformers/models/doge/configuration_doge.py +32 -39
  387. transformers/models/doge/modeling_doge.py +49 -45
  388. transformers/models/doge/modular_doge.py +63 -71
  389. transformers/models/donut/configuration_donut_swin.py +1 -0
  390. transformers/models/donut/image_processing_donut.py +29 -26
  391. transformers/models/donut/image_processing_donut_fast.py +15 -9
  392. transformers/models/donut/modeling_donut_swin.py +58 -62
  393. transformers/models/donut/processing_donut.py +26 -5
  394. transformers/models/dots1/configuration_dots1.py +33 -41
  395. transformers/models/dots1/modeling_dots1.py +45 -54
  396. transformers/models/dots1/modular_dots1.py +4 -5
  397. transformers/models/dpr/configuration_dpr.py +2 -19
  398. transformers/models/dpr/modeling_dpr.py +39 -42
  399. transformers/models/dpr/tokenization_dpr.py +9 -19
  400. transformers/models/dpr/tokenization_dpr_fast.py +9 -7
  401. transformers/models/dpt/configuration_dpt.py +2 -1
  402. transformers/models/dpt/image_processing_dpt.py +66 -65
  403. transformers/models/dpt/image_processing_dpt_fast.py +20 -18
  404. transformers/models/dpt/modeling_dpt.py +30 -32
  405. transformers/models/dpt/modular_dpt.py +17 -15
  406. transformers/models/edgetam/configuration_edgetam.py +3 -2
  407. transformers/models/edgetam/modeling_edgetam.py +86 -86
  408. transformers/models/edgetam/modular_edgetam.py +26 -21
  409. transformers/models/edgetam_video/__init__.py +1 -0
  410. transformers/models/edgetam_video/configuration_edgetam_video.py +1 -0
  411. transformers/models/edgetam_video/modeling_edgetam_video.py +158 -169
  412. transformers/models/edgetam_video/modular_edgetam_video.py +37 -30
  413. transformers/models/efficientloftr/configuration_efficientloftr.py +5 -4
  414. transformers/models/efficientloftr/image_processing_efficientloftr.py +16 -14
  415. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +9 -9
  416. transformers/models/efficientloftr/modeling_efficientloftr.py +38 -59
  417. transformers/models/efficientloftr/modular_efficientloftr.py +3 -1
  418. transformers/models/efficientnet/configuration_efficientnet.py +1 -0
  419. transformers/models/efficientnet/image_processing_efficientnet.py +32 -28
  420. transformers/models/efficientnet/image_processing_efficientnet_fast.py +19 -17
  421. transformers/models/efficientnet/modeling_efficientnet.py +15 -19
  422. transformers/models/electra/configuration_electra.py +3 -13
  423. transformers/models/electra/modeling_electra.py +103 -108
  424. transformers/models/emu3/configuration_emu3.py +17 -13
  425. transformers/models/emu3/image_processing_emu3.py +39 -44
  426. transformers/models/emu3/modeling_emu3.py +108 -148
  427. transformers/models/emu3/modular_emu3.py +73 -115
  428. transformers/models/emu3/processing_emu3.py +43 -18
  429. transformers/models/encodec/configuration_encodec.py +4 -2
  430. transformers/models/encodec/feature_extraction_encodec.py +13 -10
  431. transformers/models/encodec/modeling_encodec.py +29 -39
  432. transformers/models/encoder_decoder/configuration_encoder_decoder.py +2 -12
  433. transformers/models/encoder_decoder/modeling_encoder_decoder.py +43 -37
  434. transformers/models/eomt/configuration_eomt.py +1 -0
  435. transformers/models/eomt/image_processing_eomt.py +56 -66
  436. transformers/models/eomt/image_processing_eomt_fast.py +33 -76
  437. transformers/models/eomt/modeling_eomt.py +18 -23
  438. transformers/models/eomt/modular_eomt.py +13 -18
  439. transformers/models/ernie/configuration_ernie.py +3 -24
  440. transformers/models/ernie/modeling_ernie.py +132 -127
  441. transformers/models/ernie/modular_ernie.py +103 -97
  442. transformers/models/ernie4_5/configuration_ernie4_5.py +27 -23
  443. transformers/models/ernie4_5/modeling_ernie4_5.py +38 -36
  444. transformers/models/ernie4_5/modular_ernie4_5.py +4 -3
  445. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +36 -32
  446. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +55 -56
  447. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +46 -18
  448. transformers/models/esm/configuration_esm.py +15 -11
  449. transformers/models/esm/modeling_esm.py +34 -38
  450. transformers/models/esm/modeling_esmfold.py +49 -53
  451. transformers/models/esm/openfold_utils/chunk_utils.py +6 -6
  452. transformers/models/esm/openfold_utils/loss.py +2 -1
  453. transformers/models/esm/openfold_utils/protein.py +16 -15
  454. transformers/models/esm/openfold_utils/tensor_utils.py +6 -6
  455. transformers/models/esm/tokenization_esm.py +4 -2
  456. transformers/models/evolla/configuration_evolla.py +40 -50
  457. transformers/models/evolla/modeling_evolla.py +66 -71
  458. transformers/models/evolla/modular_evolla.py +47 -53
  459. transformers/models/evolla/processing_evolla.py +35 -23
  460. transformers/models/exaone4/configuration_exaone4.py +25 -23
  461. transformers/models/exaone4/modeling_exaone4.py +38 -35
  462. transformers/models/exaone4/modular_exaone4.py +46 -44
  463. transformers/models/falcon/configuration_falcon.py +26 -31
  464. transformers/models/falcon/modeling_falcon.py +80 -82
  465. transformers/models/falcon_h1/configuration_falcon_h1.py +51 -45
  466. transformers/models/falcon_h1/modeling_falcon_h1.py +82 -85
  467. transformers/models/falcon_h1/modular_falcon_h1.py +51 -56
  468. transformers/models/falcon_mamba/configuration_falcon_mamba.py +2 -1
  469. transformers/models/falcon_mamba/modeling_falcon_mamba.py +82 -75
  470. transformers/models/falcon_mamba/modular_falcon_mamba.py +45 -28
  471. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +6 -2
  472. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +60 -76
  473. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +3 -2
  474. transformers/models/flaubert/configuration_flaubert.py +5 -10
  475. transformers/models/flaubert/modeling_flaubert.py +143 -145
  476. transformers/models/flaubert/tokenization_flaubert.py +5 -3
  477. transformers/models/flava/configuration_flava.py +6 -5
  478. transformers/models/flava/image_processing_flava.py +67 -66
  479. transformers/models/flava/image_processing_flava_fast.py +49 -46
  480. transformers/models/flava/modeling_flava.py +136 -153
  481. transformers/models/flava/processing_flava.py +12 -2
  482. transformers/models/flex_olmo/__init__.py +1 -0
  483. transformers/models/flex_olmo/configuration_flex_olmo.py +32 -28
  484. transformers/models/flex_olmo/modeling_flex_olmo.py +47 -47
  485. transformers/models/flex_olmo/modular_flex_olmo.py +44 -40
  486. transformers/models/florence2/configuration_florence2.py +1 -0
  487. transformers/models/florence2/modeling_florence2.py +69 -111
  488. transformers/models/florence2/modular_florence2.py +101 -104
  489. transformers/models/florence2/processing_florence2.py +47 -18
  490. transformers/models/fnet/configuration_fnet.py +2 -6
  491. transformers/models/fnet/modeling_fnet.py +80 -83
  492. transformers/models/fnet/tokenization_fnet.py +1 -0
  493. transformers/models/focalnet/configuration_focalnet.py +1 -0
  494. transformers/models/focalnet/modeling_focalnet.py +45 -51
  495. transformers/models/fsmt/configuration_fsmt.py +17 -12
  496. transformers/models/fsmt/modeling_fsmt.py +48 -49
  497. transformers/models/fsmt/tokenization_fsmt.py +5 -3
  498. transformers/models/funnel/configuration_funnel.py +1 -8
  499. transformers/models/funnel/modeling_funnel.py +93 -99
  500. transformers/models/funnel/tokenization_funnel.py +27 -17
  501. transformers/models/fuyu/configuration_fuyu.py +34 -28
  502. transformers/models/fuyu/image_processing_fuyu.py +31 -29
  503. transformers/models/fuyu/image_processing_fuyu_fast.py +17 -17
  504. transformers/models/fuyu/modeling_fuyu.py +53 -53
  505. transformers/models/fuyu/processing_fuyu.py +34 -23
  506. transformers/models/gemma/configuration_gemma.py +30 -25
  507. transformers/models/gemma/modeling_gemma.py +50 -46
  508. transformers/models/gemma/modular_gemma.py +47 -42
  509. transformers/models/gemma/tokenization_gemma.py +30 -10
  510. transformers/models/gemma2/configuration_gemma2.py +35 -30
  511. transformers/models/gemma2/modeling_gemma2.py +42 -39
  512. transformers/models/gemma2/modular_gemma2.py +66 -63
  513. transformers/models/gemma3/configuration_gemma3.py +44 -44
  514. transformers/models/gemma3/image_processing_gemma3.py +31 -29
  515. transformers/models/gemma3/image_processing_gemma3_fast.py +13 -11
  516. transformers/models/gemma3/modeling_gemma3.py +207 -159
  517. transformers/models/gemma3/modular_gemma3.py +204 -153
  518. transformers/models/gemma3/processing_gemma3.py +5 -5
  519. transformers/models/gemma3n/configuration_gemma3n.py +26 -36
  520. transformers/models/gemma3n/feature_extraction_gemma3n.py +11 -9
  521. transformers/models/gemma3n/modeling_gemma3n.py +356 -222
  522. transformers/models/gemma3n/modular_gemma3n.py +207 -230
  523. transformers/models/gemma3n/processing_gemma3n.py +26 -12
  524. transformers/models/git/configuration_git.py +8 -5
  525. transformers/models/git/modeling_git.py +204 -266
  526. transformers/models/git/processing_git.py +14 -2
  527. transformers/models/glm/configuration_glm.py +28 -24
  528. transformers/models/glm/modeling_glm.py +40 -37
  529. transformers/models/glm/modular_glm.py +7 -4
  530. transformers/models/glm4/configuration_glm4.py +28 -24
  531. transformers/models/glm4/modeling_glm4.py +42 -40
  532. transformers/models/glm4/modular_glm4.py +10 -8
  533. transformers/models/glm46v/configuration_glm46v.py +1 -0
  534. transformers/models/glm46v/image_processing_glm46v.py +40 -35
  535. transformers/models/glm46v/image_processing_glm46v_fast.py +9 -9
  536. transformers/models/glm46v/modeling_glm46v.py +90 -137
  537. transformers/models/glm46v/modular_glm46v.py +3 -4
  538. transformers/models/glm46v/processing_glm46v.py +41 -7
  539. transformers/models/glm46v/video_processing_glm46v.py +11 -9
  540. transformers/models/glm4_moe/configuration_glm4_moe.py +32 -40
  541. transformers/models/glm4_moe/modeling_glm4_moe.py +42 -45
  542. transformers/models/glm4_moe/modular_glm4_moe.py +34 -42
  543. transformers/models/glm4v/configuration_glm4v.py +20 -18
  544. transformers/models/glm4v/image_processing_glm4v.py +40 -34
  545. transformers/models/glm4v/image_processing_glm4v_fast.py +9 -8
  546. transformers/models/glm4v/modeling_glm4v.py +205 -254
  547. transformers/models/glm4v/modular_glm4v.py +224 -210
  548. transformers/models/glm4v/processing_glm4v.py +41 -7
  549. transformers/models/glm4v/video_processing_glm4v.py +11 -9
  550. transformers/models/glm4v_moe/configuration_glm4v_moe.py +125 -136
  551. transformers/models/glm4v_moe/modeling_glm4v_moe.py +368 -377
  552. transformers/models/glm4v_moe/modular_glm4v_moe.py +169 -83
  553. transformers/models/glpn/configuration_glpn.py +1 -0
  554. transformers/models/glpn/image_processing_glpn.py +12 -11
  555. transformers/models/glpn/image_processing_glpn_fast.py +13 -11
  556. transformers/models/glpn/modeling_glpn.py +14 -16
  557. transformers/models/got_ocr2/configuration_got_ocr2.py +12 -4
  558. transformers/models/got_ocr2/image_processing_got_ocr2.py +24 -22
  559. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +11 -9
  560. transformers/models/got_ocr2/modeling_got_ocr2.py +80 -77
  561. transformers/models/got_ocr2/modular_got_ocr2.py +51 -54
  562. transformers/models/got_ocr2/processing_got_ocr2.py +63 -42
  563. transformers/models/gpt2/configuration_gpt2.py +2 -13
  564. transformers/models/gpt2/modeling_gpt2.py +115 -120
  565. transformers/models/gpt2/tokenization_gpt2.py +46 -15
  566. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +2 -5
  567. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +89 -79
  568. transformers/models/gpt_neo/configuration_gpt_neo.py +2 -9
  569. transformers/models/gpt_neo/modeling_gpt_neo.py +67 -83
  570. transformers/models/gpt_neox/configuration_gpt_neox.py +25 -25
  571. transformers/models/gpt_neox/modeling_gpt_neox.py +75 -76
  572. transformers/models/gpt_neox/modular_gpt_neox.py +66 -67
  573. transformers/models/gpt_neox/tokenization_gpt_neox.py +51 -9
  574. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +19 -24
  575. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +47 -46
  576. transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +3 -1
  577. transformers/models/gpt_oss/configuration_gpt_oss.py +28 -46
  578. transformers/models/gpt_oss/modeling_gpt_oss.py +121 -83
  579. transformers/models/gpt_oss/modular_gpt_oss.py +103 -64
  580. transformers/models/gpt_sw3/tokenization_gpt_sw3.py +4 -4
  581. transformers/models/gptj/configuration_gptj.py +4 -4
  582. transformers/models/gptj/modeling_gptj.py +87 -101
  583. transformers/models/granite/configuration_granite.py +33 -28
  584. transformers/models/granite/modeling_granite.py +46 -44
  585. transformers/models/granite/modular_granite.py +31 -29
  586. transformers/models/granite_speech/configuration_granite_speech.py +1 -0
  587. transformers/models/granite_speech/feature_extraction_granite_speech.py +3 -1
  588. transformers/models/granite_speech/modeling_granite_speech.py +52 -82
  589. transformers/models/granite_speech/processing_granite_speech.py +4 -11
  590. transformers/models/granitemoe/configuration_granitemoe.py +36 -31
  591. transformers/models/granitemoe/modeling_granitemoe.py +46 -41
  592. transformers/models/granitemoe/modular_granitemoe.py +27 -22
  593. transformers/models/granitemoehybrid/__init__.py +1 -0
  594. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +47 -46
  595. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +93 -97
  596. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +21 -54
  597. transformers/models/granitemoeshared/configuration_granitemoeshared.py +37 -33
  598. transformers/models/granitemoeshared/modeling_granitemoeshared.py +61 -54
  599. transformers/models/granitemoeshared/modular_granitemoeshared.py +21 -19
  600. transformers/models/grounding_dino/configuration_grounding_dino.py +4 -6
  601. transformers/models/grounding_dino/image_processing_grounding_dino.py +62 -60
  602. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +29 -28
  603. transformers/models/grounding_dino/modeling_grounding_dino.py +140 -155
  604. transformers/models/grounding_dino/modular_grounding_dino.py +3 -2
  605. transformers/models/grounding_dino/processing_grounding_dino.py +38 -10
  606. transformers/models/groupvit/configuration_groupvit.py +2 -4
  607. transformers/models/groupvit/modeling_groupvit.py +93 -107
  608. transformers/models/helium/configuration_helium.py +29 -25
  609. transformers/models/helium/modeling_helium.py +40 -38
  610. transformers/models/helium/modular_helium.py +7 -3
  611. transformers/models/herbert/tokenization_herbert.py +28 -10
  612. transformers/models/hgnet_v2/configuration_hgnet_v2.py +1 -0
  613. transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -24
  614. transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -24
  615. transformers/models/hiera/configuration_hiera.py +1 -0
  616. transformers/models/hiera/modeling_hiera.py +66 -72
  617. transformers/models/hubert/configuration_hubert.py +2 -4
  618. transformers/models/hubert/modeling_hubert.py +37 -42
  619. transformers/models/hubert/modular_hubert.py +11 -13
  620. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +31 -26
  621. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +38 -35
  622. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +6 -4
  623. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  624. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +36 -31
  625. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +42 -47
  626. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +9 -9
  627. transformers/models/ibert/configuration_ibert.py +2 -4
  628. transformers/models/ibert/modeling_ibert.py +62 -82
  629. transformers/models/ibert/quant_modules.py +1 -0
  630. transformers/models/idefics/configuration_idefics.py +8 -5
  631. transformers/models/idefics/image_processing_idefics.py +15 -13
  632. transformers/models/idefics/modeling_idefics.py +82 -75
  633. transformers/models/idefics/perceiver.py +3 -1
  634. transformers/models/idefics/processing_idefics.py +48 -32
  635. transformers/models/idefics/vision.py +25 -24
  636. transformers/models/idefics2/configuration_idefics2.py +3 -1
  637. transformers/models/idefics2/image_processing_idefics2.py +32 -31
  638. transformers/models/idefics2/image_processing_idefics2_fast.py +8 -8
  639. transformers/models/idefics2/modeling_idefics2.py +101 -127
  640. transformers/models/idefics2/processing_idefics2.py +68 -10
  641. transformers/models/idefics3/configuration_idefics3.py +4 -1
  642. transformers/models/idefics3/image_processing_idefics3.py +43 -42
  643. transformers/models/idefics3/image_processing_idefics3_fast.py +15 -40
  644. transformers/models/idefics3/modeling_idefics3.py +90 -115
  645. transformers/models/idefics3/processing_idefics3.py +69 -15
  646. transformers/models/ijepa/configuration_ijepa.py +1 -0
  647. transformers/models/ijepa/modeling_ijepa.py +11 -10
  648. transformers/models/ijepa/modular_ijepa.py +7 -5
  649. transformers/models/imagegpt/configuration_imagegpt.py +2 -9
  650. transformers/models/imagegpt/image_processing_imagegpt.py +18 -17
  651. transformers/models/imagegpt/image_processing_imagegpt_fast.py +16 -11
  652. transformers/models/imagegpt/modeling_imagegpt.py +65 -76
  653. transformers/models/informer/configuration_informer.py +9 -6
  654. transformers/models/informer/modeling_informer.py +86 -88
  655. transformers/models/informer/modular_informer.py +16 -14
  656. transformers/models/instructblip/configuration_instructblip.py +2 -2
  657. transformers/models/instructblip/modeling_instructblip.py +63 -103
  658. transformers/models/instructblip/processing_instructblip.py +36 -10
  659. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -2
  660. transformers/models/instructblipvideo/modeling_instructblipvideo.py +139 -157
  661. transformers/models/instructblipvideo/modular_instructblipvideo.py +64 -73
  662. transformers/models/instructblipvideo/processing_instructblipvideo.py +33 -14
  663. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +8 -6
  664. transformers/models/internvl/configuration_internvl.py +1 -0
  665. transformers/models/internvl/modeling_internvl.py +106 -85
  666. transformers/models/internvl/modular_internvl.py +67 -47
  667. transformers/models/internvl/processing_internvl.py +45 -12
  668. transformers/models/internvl/video_processing_internvl.py +12 -10
  669. transformers/models/jamba/configuration_jamba.py +8 -5
  670. transformers/models/jamba/modeling_jamba.py +66 -68
  671. transformers/models/jamba/modular_jamba.py +55 -54
  672. transformers/models/janus/configuration_janus.py +1 -0
  673. transformers/models/janus/image_processing_janus.py +37 -35
  674. transformers/models/janus/image_processing_janus_fast.py +20 -18
  675. transformers/models/janus/modeling_janus.py +191 -115
  676. transformers/models/janus/modular_janus.py +84 -133
  677. transformers/models/janus/processing_janus.py +43 -17
  678. transformers/models/jetmoe/configuration_jetmoe.py +26 -24
  679. transformers/models/jetmoe/modeling_jetmoe.py +46 -43
  680. transformers/models/jetmoe/modular_jetmoe.py +33 -31
  681. transformers/models/kosmos2/configuration_kosmos2.py +9 -10
  682. transformers/models/kosmos2/modeling_kosmos2.py +173 -208
  683. transformers/models/kosmos2/processing_kosmos2.py +55 -40
  684. transformers/models/kosmos2_5/__init__.py +1 -0
  685. transformers/models/kosmos2_5/configuration_kosmos2_5.py +9 -8
  686. transformers/models/kosmos2_5/image_processing_kosmos2_5.py +12 -10
  687. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +13 -4
  688. transformers/models/kosmos2_5/modeling_kosmos2_5.py +118 -132
  689. transformers/models/kosmos2_5/processing_kosmos2_5.py +29 -8
  690. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +28 -31
  691. transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +14 -12
  692. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +100 -110
  693. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +22 -28
  694. transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py +8 -2
  695. transformers/models/layoutlm/configuration_layoutlm.py +2 -14
  696. transformers/models/layoutlm/modeling_layoutlm.py +72 -77
  697. transformers/models/layoutlmv2/configuration_layoutlmv2.py +17 -14
  698. transformers/models/layoutlmv2/image_processing_layoutlmv2.py +21 -18
  699. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +9 -7
  700. transformers/models/layoutlmv2/modeling_layoutlmv2.py +50 -64
  701. transformers/models/layoutlmv2/processing_layoutlmv2.py +44 -14
  702. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +126 -73
  703. transformers/models/layoutlmv3/configuration_layoutlmv3.py +19 -16
  704. transformers/models/layoutlmv3/image_processing_layoutlmv3.py +26 -24
  705. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +11 -9
  706. transformers/models/layoutlmv3/modeling_layoutlmv3.py +56 -82
  707. transformers/models/layoutlmv3/processing_layoutlmv3.py +46 -14
  708. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +134 -74
  709. transformers/models/layoutxlm/configuration_layoutxlm.py +17 -14
  710. transformers/models/layoutxlm/modular_layoutxlm.py +1 -0
  711. transformers/models/layoutxlm/processing_layoutxlm.py +44 -14
  712. transformers/models/layoutxlm/tokenization_layoutxlm.py +113 -77
  713. transformers/models/led/configuration_led.py +12 -8
  714. transformers/models/led/modeling_led.py +266 -124
  715. transformers/models/levit/configuration_levit.py +1 -0
  716. transformers/models/levit/image_processing_levit.py +21 -19
  717. transformers/models/levit/image_processing_levit_fast.py +5 -4
  718. transformers/models/levit/modeling_levit.py +19 -38
  719. transformers/models/lfm2/configuration_lfm2.py +30 -27
  720. transformers/models/lfm2/modeling_lfm2.py +50 -47
  721. transformers/models/lfm2/modular_lfm2.py +30 -29
  722. transformers/models/lfm2_moe/__init__.py +1 -0
  723. transformers/models/lfm2_moe/configuration_lfm2_moe.py +9 -6
  724. transformers/models/lfm2_moe/modeling_lfm2_moe.py +53 -61
  725. transformers/models/lfm2_moe/modular_lfm2_moe.py +37 -13
  726. transformers/models/lfm2_vl/configuration_lfm2_vl.py +1 -4
  727. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +12 -41
  728. transformers/models/lfm2_vl/modeling_lfm2_vl.py +66 -84
  729. transformers/models/lfm2_vl/modular_lfm2_vl.py +56 -70
  730. transformers/models/lfm2_vl/processing_lfm2_vl.py +76 -96
  731. transformers/models/lightglue/image_processing_lightglue.py +15 -16
  732. transformers/models/lightglue/image_processing_lightglue_fast.py +9 -9
  733. transformers/models/lightglue/modeling_lightglue.py +31 -31
  734. transformers/models/lightglue/modular_lightglue.py +28 -29
  735. transformers/models/lilt/configuration_lilt.py +2 -6
  736. transformers/models/lilt/modeling_lilt.py +70 -76
  737. transformers/models/llama/configuration_llama.py +31 -26
  738. transformers/models/llama/modeling_llama.py +39 -36
  739. transformers/models/llama/tokenization_llama.py +44 -14
  740. transformers/models/llama4/configuration_llama4.py +30 -27
  741. transformers/models/llama4/image_processing_llama4_fast.py +14 -12
  742. transformers/models/llama4/modeling_llama4.py +113 -120
  743. transformers/models/llama4/processing_llama4.py +57 -33
  744. transformers/models/llava/configuration_llava.py +1 -10
  745. transformers/models/llava/image_processing_llava.py +28 -25
  746. transformers/models/llava/image_processing_llava_fast.py +11 -9
  747. transformers/models/llava/modeling_llava.py +109 -85
  748. transformers/models/llava/processing_llava.py +51 -18
  749. transformers/models/llava_next/configuration_llava_next.py +2 -2
  750. transformers/models/llava_next/image_processing_llava_next.py +45 -43
  751. transformers/models/llava_next/image_processing_llava_next_fast.py +13 -11
  752. transformers/models/llava_next/modeling_llava_next.py +107 -110
  753. transformers/models/llava_next/processing_llava_next.py +47 -18
  754. transformers/models/llava_next_video/configuration_llava_next_video.py +7 -4
  755. transformers/models/llava_next_video/modeling_llava_next_video.py +158 -175
  756. transformers/models/llava_next_video/modular_llava_next_video.py +150 -155
  757. transformers/models/llava_next_video/processing_llava_next_video.py +63 -21
  758. transformers/models/llava_next_video/video_processing_llava_next_video.py +1 -0
  759. transformers/models/llava_onevision/configuration_llava_onevision.py +7 -4
  760. transformers/models/llava_onevision/image_processing_llava_onevision.py +42 -40
  761. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +15 -14
  762. transformers/models/llava_onevision/modeling_llava_onevision.py +169 -177
  763. transformers/models/llava_onevision/modular_llava_onevision.py +156 -163
  764. transformers/models/llava_onevision/processing_llava_onevision.py +53 -21
  765. transformers/models/llava_onevision/video_processing_llava_onevision.py +1 -0
  766. transformers/models/longcat_flash/__init__.py +1 -0
  767. transformers/models/longcat_flash/configuration_longcat_flash.py +42 -37
  768. transformers/models/longcat_flash/modeling_longcat_flash.py +36 -36
  769. transformers/models/longcat_flash/modular_longcat_flash.py +21 -21
  770. transformers/models/longformer/configuration_longformer.py +5 -5
  771. transformers/models/longformer/modeling_longformer.py +101 -105
  772. transformers/models/longt5/configuration_longt5.py +7 -9
  773. transformers/models/longt5/modeling_longt5.py +49 -49
  774. transformers/models/luke/configuration_luke.py +2 -8
  775. transformers/models/luke/modeling_luke.py +181 -188
  776. transformers/models/luke/tokenization_luke.py +140 -107
  777. transformers/models/lxmert/configuration_lxmert.py +1 -16
  778. transformers/models/lxmert/modeling_lxmert.py +74 -65
  779. transformers/models/m2m_100/configuration_m2m_100.py +9 -7
  780. transformers/models/m2m_100/modeling_m2m_100.py +71 -83
  781. transformers/models/m2m_100/tokenization_m2m_100.py +8 -8
  782. transformers/models/mamba/configuration_mamba.py +2 -1
  783. transformers/models/mamba/modeling_mamba.py +66 -58
  784. transformers/models/mamba2/configuration_mamba2.py +8 -5
  785. transformers/models/mamba2/modeling_mamba2.py +69 -68
  786. transformers/models/marian/configuration_marian.py +5 -10
  787. transformers/models/marian/modeling_marian.py +87 -93
  788. transformers/models/marian/tokenization_marian.py +6 -6
  789. transformers/models/markuplm/configuration_markuplm.py +7 -4
  790. transformers/models/markuplm/feature_extraction_markuplm.py +2 -1
  791. transformers/models/markuplm/modeling_markuplm.py +70 -69
  792. transformers/models/markuplm/processing_markuplm.py +38 -31
  793. transformers/models/markuplm/tokenization_markuplm.py +136 -93
  794. transformers/models/mask2former/configuration_mask2former.py +8 -5
  795. transformers/models/mask2former/image_processing_mask2former.py +85 -84
  796. transformers/models/mask2former/image_processing_mask2former_fast.py +40 -37
  797. transformers/models/mask2former/modeling_mask2former.py +103 -118
  798. transformers/models/mask2former/modular_mask2former.py +8 -6
  799. transformers/models/maskformer/configuration_maskformer.py +9 -6
  800. transformers/models/maskformer/configuration_maskformer_swin.py +1 -0
  801. transformers/models/maskformer/image_processing_maskformer.py +85 -84
  802. transformers/models/maskformer/image_processing_maskformer_fast.py +40 -36
  803. transformers/models/maskformer/modeling_maskformer.py +65 -79
  804. transformers/models/maskformer/modeling_maskformer_swin.py +32 -36
  805. transformers/models/mbart/configuration_mbart.py +4 -9
  806. transformers/models/mbart/modeling_mbart.py +116 -131
  807. transformers/models/mbart/tokenization_mbart.py +54 -11
  808. transformers/models/mbart50/tokenization_mbart50.py +13 -8
  809. transformers/models/megatron_bert/configuration_megatron_bert.py +3 -13
  810. transformers/models/megatron_bert/modeling_megatron_bert.py +150 -148
  811. transformers/models/metaclip_2/configuration_metaclip_2.py +1 -4
  812. transformers/models/metaclip_2/modeling_metaclip_2.py +84 -91
  813. transformers/models/metaclip_2/modular_metaclip_2.py +45 -61
  814. transformers/models/mgp_str/configuration_mgp_str.py +1 -0
  815. transformers/models/mgp_str/modeling_mgp_str.py +18 -20
  816. transformers/models/mgp_str/processing_mgp_str.py +20 -3
  817. transformers/models/mgp_str/tokenization_mgp_str.py +3 -1
  818. transformers/models/mimi/configuration_mimi.py +40 -42
  819. transformers/models/mimi/modeling_mimi.py +113 -142
  820. transformers/models/minimax/__init__.py +1 -0
  821. transformers/models/minimax/configuration_minimax.py +43 -37
  822. transformers/models/minimax/modeling_minimax.py +51 -61
  823. transformers/models/minimax/modular_minimax.py +62 -68
  824. transformers/models/ministral/configuration_ministral.py +29 -25
  825. transformers/models/ministral/modeling_ministral.py +38 -36
  826. transformers/models/ministral/modular_ministral.py +37 -32
  827. transformers/models/ministral3/configuration_ministral3.py +27 -24
  828. transformers/models/ministral3/modeling_ministral3.py +37 -36
  829. transformers/models/ministral3/modular_ministral3.py +5 -4
  830. transformers/models/mistral/configuration_mistral.py +29 -24
  831. transformers/models/mistral/modeling_mistral.py +37 -36
  832. transformers/models/mistral/modular_mistral.py +12 -11
  833. transformers/models/mistral3/configuration_mistral3.py +1 -4
  834. transformers/models/mistral3/modeling_mistral3.py +86 -89
  835. transformers/models/mistral3/modular_mistral3.py +68 -69
  836. transformers/models/mixtral/configuration_mixtral.py +34 -29
  837. transformers/models/mixtral/modeling_mixtral.py +45 -50
  838. transformers/models/mixtral/modular_mixtral.py +31 -32
  839. transformers/models/mlcd/configuration_mlcd.py +1 -0
  840. transformers/models/mlcd/modeling_mlcd.py +14 -20
  841. transformers/models/mlcd/modular_mlcd.py +13 -17
  842. transformers/models/mllama/configuration_mllama.py +15 -10
  843. transformers/models/mllama/image_processing_mllama.py +25 -23
  844. transformers/models/mllama/image_processing_mllama_fast.py +11 -11
  845. transformers/models/mllama/modeling_mllama.py +94 -105
  846. transformers/models/mllama/processing_mllama.py +55 -6
  847. transformers/models/mluke/tokenization_mluke.py +107 -101
  848. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +3 -5
  849. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +140 -155
  850. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +3 -5
  851. transformers/models/mobilebert/configuration_mobilebert.py +2 -4
  852. transformers/models/mobilebert/modeling_mobilebert.py +85 -77
  853. transformers/models/mobilebert/tokenization_mobilebert.py +1 -0
  854. transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +1 -0
  855. transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +23 -20
  856. transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +1 -0
  857. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +16 -15
  858. transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +1 -0
  859. transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +51 -48
  860. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +15 -13
  861. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +22 -24
  862. transformers/models/mobilevit/configuration_mobilevit.py +1 -0
  863. transformers/models/mobilevit/image_processing_mobilevit.py +49 -46
  864. transformers/models/mobilevit/image_processing_mobilevit_fast.py +14 -12
  865. transformers/models/mobilevit/modeling_mobilevit.py +21 -28
  866. transformers/models/mobilevitv2/configuration_mobilevitv2.py +1 -0
  867. transformers/models/mobilevitv2/modeling_mobilevitv2.py +22 -28
  868. transformers/models/modernbert/configuration_modernbert.py +42 -44
  869. transformers/models/modernbert/modeling_modernbert.py +133 -145
  870. transformers/models/modernbert/modular_modernbert.py +170 -186
  871. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +40 -40
  872. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +57 -62
  873. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +86 -94
  874. transformers/models/moonshine/configuration_moonshine.py +31 -34
  875. transformers/models/moonshine/modeling_moonshine.py +71 -71
  876. transformers/models/moonshine/modular_moonshine.py +83 -88
  877. transformers/models/moshi/configuration_moshi.py +23 -46
  878. transformers/models/moshi/modeling_moshi.py +187 -157
  879. transformers/models/mpnet/configuration_mpnet.py +2 -6
  880. transformers/models/mpnet/modeling_mpnet.py +57 -62
  881. transformers/models/mpnet/tokenization_mpnet.py +15 -4
  882. transformers/models/mpt/configuration_mpt.py +9 -5
  883. transformers/models/mpt/modeling_mpt.py +60 -60
  884. transformers/models/mra/configuration_mra.py +2 -8
  885. transformers/models/mra/modeling_mra.py +57 -64
  886. transformers/models/mt5/configuration_mt5.py +8 -10
  887. transformers/models/mt5/modeling_mt5.py +95 -87
  888. transformers/models/musicgen/configuration_musicgen.py +8 -12
  889. transformers/models/musicgen/modeling_musicgen.py +122 -118
  890. transformers/models/musicgen/processing_musicgen.py +21 -3
  891. transformers/models/musicgen_melody/configuration_musicgen_melody.py +8 -15
  892. transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +9 -8
  893. transformers/models/musicgen_melody/modeling_musicgen_melody.py +123 -117
  894. transformers/models/musicgen_melody/processing_musicgen_melody.py +22 -3
  895. transformers/models/mvp/configuration_mvp.py +5 -8
  896. transformers/models/mvp/modeling_mvp.py +123 -135
  897. transformers/models/myt5/tokenization_myt5.py +10 -8
  898. transformers/models/nanochat/configuration_nanochat.py +8 -5
  899. transformers/models/nanochat/modeling_nanochat.py +40 -37
  900. transformers/models/nanochat/modular_nanochat.py +14 -12
  901. transformers/models/nemotron/configuration_nemotron.py +30 -25
  902. transformers/models/nemotron/modeling_nemotron.py +57 -56
  903. transformers/models/nllb/tokenization_nllb.py +28 -12
  904. transformers/models/nllb_moe/configuration_nllb_moe.py +9 -7
  905. transformers/models/nllb_moe/modeling_nllb_moe.py +69 -77
  906. transformers/models/nougat/image_processing_nougat.py +32 -29
  907. transformers/models/nougat/image_processing_nougat_fast.py +14 -12
  908. transformers/models/nougat/processing_nougat.py +39 -37
  909. transformers/models/nougat/tokenization_nougat.py +73 -18
  910. transformers/models/nystromformer/configuration_nystromformer.py +2 -8
  911. transformers/models/nystromformer/modeling_nystromformer.py +63 -74
  912. transformers/models/olmo/configuration_olmo.py +28 -23
  913. transformers/models/olmo/modeling_olmo.py +39 -36
  914. transformers/models/olmo/modular_olmo.py +11 -7
  915. transformers/models/olmo2/configuration_olmo2.py +28 -23
  916. transformers/models/olmo2/modeling_olmo2.py +41 -37
  917. transformers/models/olmo2/modular_olmo2.py +32 -29
  918. transformers/models/olmo3/__init__.py +1 -0
  919. transformers/models/olmo3/configuration_olmo3.py +30 -26
  920. transformers/models/olmo3/modeling_olmo3.py +39 -36
  921. transformers/models/olmo3/modular_olmo3.py +40 -37
  922. transformers/models/olmoe/configuration_olmoe.py +33 -29
  923. transformers/models/olmoe/modeling_olmoe.py +46 -52
  924. transformers/models/olmoe/modular_olmoe.py +15 -16
  925. transformers/models/omdet_turbo/configuration_omdet_turbo.py +4 -2
  926. transformers/models/omdet_turbo/modeling_omdet_turbo.py +47 -53
  927. transformers/models/omdet_turbo/processing_omdet_turbo.py +67 -19
  928. transformers/models/oneformer/configuration_oneformer.py +8 -5
  929. transformers/models/oneformer/image_processing_oneformer.py +84 -83
  930. transformers/models/oneformer/image_processing_oneformer_fast.py +42 -41
  931. transformers/models/oneformer/modeling_oneformer.py +171 -147
  932. transformers/models/oneformer/processing_oneformer.py +43 -28
  933. transformers/models/openai/configuration_openai.py +1 -16
  934. transformers/models/openai/modeling_openai.py +51 -65
  935. transformers/models/openai/tokenization_openai.py +47 -8
  936. transformers/models/opt/configuration_opt.py +7 -6
  937. transformers/models/opt/modeling_opt.py +76 -78
  938. transformers/models/ovis2/__init__.py +1 -0
  939. transformers/models/ovis2/configuration_ovis2.py +1 -0
  940. transformers/models/ovis2/image_processing_ovis2.py +24 -22
  941. transformers/models/ovis2/image_processing_ovis2_fast.py +11 -9
  942. transformers/models/ovis2/modeling_ovis2.py +142 -111
  943. transformers/models/ovis2/modular_ovis2.py +45 -90
  944. transformers/models/ovis2/processing_ovis2.py +40 -12
  945. transformers/models/owlv2/configuration_owlv2.py +2 -4
  946. transformers/models/owlv2/image_processing_owlv2.py +21 -20
  947. transformers/models/owlv2/image_processing_owlv2_fast.py +15 -12
  948. transformers/models/owlv2/modeling_owlv2.py +117 -133
  949. transformers/models/owlv2/modular_owlv2.py +14 -11
  950. transformers/models/owlv2/processing_owlv2.py +49 -20
  951. transformers/models/owlvit/configuration_owlvit.py +2 -4
  952. transformers/models/owlvit/image_processing_owlvit.py +22 -21
  953. transformers/models/owlvit/image_processing_owlvit_fast.py +3 -2
  954. transformers/models/owlvit/modeling_owlvit.py +116 -132
  955. transformers/models/owlvit/processing_owlvit.py +48 -20
  956. transformers/models/paligemma/configuration_paligemma.py +1 -4
  957. transformers/models/paligemma/modeling_paligemma.py +93 -103
  958. transformers/models/paligemma/processing_paligemma.py +66 -13
  959. transformers/models/parakeet/configuration_parakeet.py +14 -7
  960. transformers/models/parakeet/feature_extraction_parakeet.py +12 -10
  961. transformers/models/parakeet/modeling_parakeet.py +28 -32
  962. transformers/models/parakeet/modular_parakeet.py +20 -23
  963. transformers/models/parakeet/processing_parakeet.py +5 -13
  964. transformers/models/parakeet/{tokenization_parakeet.py → tokenization_parakeet_fast.py} +7 -5
  965. transformers/models/patchtsmixer/configuration_patchtsmixer.py +8 -5
  966. transformers/models/patchtsmixer/modeling_patchtsmixer.py +62 -70
  967. transformers/models/patchtst/configuration_patchtst.py +9 -6
  968. transformers/models/patchtst/modeling_patchtst.py +80 -97
  969. transformers/models/pegasus/configuration_pegasus.py +5 -8
  970. transformers/models/pegasus/modeling_pegasus.py +66 -72
  971. transformers/models/pegasus/tokenization_pegasus.py +45 -15
  972. transformers/models/pegasus_x/configuration_pegasus_x.py +4 -5
  973. transformers/models/pegasus_x/modeling_pegasus_x.py +52 -55
  974. transformers/models/perceiver/configuration_perceiver.py +1 -0
  975. transformers/models/perceiver/image_processing_perceiver.py +25 -22
  976. transformers/models/perceiver/image_processing_perceiver_fast.py +9 -7
  977. transformers/models/perceiver/modeling_perceiver.py +146 -165
  978. transformers/models/perceiver/tokenization_perceiver.py +6 -3
  979. transformers/models/perception_lm/configuration_perception_lm.py +1 -0
  980. transformers/models/perception_lm/image_processing_perception_lm_fast.py +10 -8
  981. transformers/models/perception_lm/modeling_perception_lm.py +70 -71
  982. transformers/models/perception_lm/modular_perception_lm.py +61 -65
  983. transformers/models/perception_lm/processing_perception_lm.py +47 -13
  984. transformers/models/perception_lm/video_processing_perception_lm.py +1 -0
  985. transformers/models/persimmon/configuration_persimmon.py +28 -23
  986. transformers/models/persimmon/modeling_persimmon.py +45 -43
  987. transformers/models/phi/configuration_phi.py +28 -23
  988. transformers/models/phi/modeling_phi.py +43 -40
  989. transformers/models/phi/modular_phi.py +24 -23
  990. transformers/models/phi3/configuration_phi3.py +33 -28
  991. transformers/models/phi3/modeling_phi3.py +38 -36
  992. transformers/models/phi3/modular_phi3.py +17 -13
  993. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +33 -30
  994. transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +9 -7
  995. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +11 -11
  996. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +78 -95
  997. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +80 -98
  998. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +44 -7
  999. transformers/models/phimoe/configuration_phimoe.py +36 -31
  1000. transformers/models/phimoe/modeling_phimoe.py +45 -50
  1001. transformers/models/phimoe/modular_phimoe.py +4 -3
  1002. transformers/models/phobert/tokenization_phobert.py +6 -4
  1003. transformers/models/pix2struct/configuration_pix2struct.py +10 -12
  1004. transformers/models/pix2struct/image_processing_pix2struct.py +19 -15
  1005. transformers/models/pix2struct/image_processing_pix2struct_fast.py +15 -12
  1006. transformers/models/pix2struct/modeling_pix2struct.py +52 -58
  1007. transformers/models/pix2struct/processing_pix2struct.py +30 -5
  1008. transformers/models/pixtral/configuration_pixtral.py +14 -11
  1009. transformers/models/pixtral/image_processing_pixtral.py +28 -26
  1010. transformers/models/pixtral/image_processing_pixtral_fast.py +11 -10
  1011. transformers/models/pixtral/modeling_pixtral.py +34 -28
  1012. transformers/models/pixtral/processing_pixtral.py +53 -21
  1013. transformers/models/plbart/configuration_plbart.py +5 -8
  1014. transformers/models/plbart/modeling_plbart.py +106 -119
  1015. transformers/models/plbart/modular_plbart.py +33 -39
  1016. transformers/models/plbart/tokenization_plbart.py +7 -4
  1017. transformers/models/poolformer/configuration_poolformer.py +1 -0
  1018. transformers/models/poolformer/image_processing_poolformer.py +24 -21
  1019. transformers/models/poolformer/image_processing_poolformer_fast.py +15 -13
  1020. transformers/models/poolformer/modeling_poolformer.py +13 -23
  1021. transformers/models/pop2piano/configuration_pop2piano.py +8 -7
  1022. transformers/models/pop2piano/feature_extraction_pop2piano.py +9 -6
  1023. transformers/models/pop2piano/modeling_pop2piano.py +24 -26
  1024. transformers/models/pop2piano/processing_pop2piano.py +33 -25
  1025. transformers/models/pop2piano/tokenization_pop2piano.py +23 -15
  1026. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +3 -3
  1027. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +28 -28
  1028. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +21 -20
  1029. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +13 -16
  1030. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +13 -16
  1031. transformers/models/prophetnet/configuration_prophetnet.py +38 -37
  1032. transformers/models/prophetnet/modeling_prophetnet.py +131 -114
  1033. transformers/models/prophetnet/tokenization_prophetnet.py +16 -14
  1034. transformers/models/pvt/configuration_pvt.py +1 -0
  1035. transformers/models/pvt/image_processing_pvt.py +27 -24
  1036. transformers/models/pvt/image_processing_pvt_fast.py +2 -1
  1037. transformers/models/pvt/modeling_pvt.py +21 -21
  1038. transformers/models/pvt_v2/configuration_pvt_v2.py +4 -2
  1039. transformers/models/pvt_v2/modeling_pvt_v2.py +25 -28
  1040. transformers/models/qwen2/configuration_qwen2.py +25 -32
  1041. transformers/models/qwen2/modeling_qwen2.py +38 -36
  1042. transformers/models/qwen2/modular_qwen2.py +12 -11
  1043. transformers/models/qwen2/tokenization_qwen2.py +23 -12
  1044. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +26 -32
  1045. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +277 -340
  1046. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +211 -278
  1047. transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +49 -41
  1048. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +35 -29
  1049. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +148 -203
  1050. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +118 -93
  1051. transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +43 -7
  1052. transformers/models/qwen2_audio/configuration_qwen2_audio.py +1 -0
  1053. transformers/models/qwen2_audio/modeling_qwen2_audio.py +40 -40
  1054. transformers/models/qwen2_audio/processing_qwen2_audio.py +42 -13
  1055. transformers/models/qwen2_moe/configuration_qwen2_moe.py +35 -42
  1056. transformers/models/qwen2_moe/modeling_qwen2_moe.py +46 -51
  1057. transformers/models/qwen2_moe/modular_qwen2_moe.py +10 -7
  1058. transformers/models/qwen2_vl/configuration_qwen2_vl.py +34 -29
  1059. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +42 -41
  1060. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +15 -12
  1061. transformers/models/qwen2_vl/modeling_qwen2_vl.py +153 -199
  1062. transformers/models/qwen2_vl/processing_qwen2_vl.py +44 -7
  1063. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +18 -38
  1064. transformers/models/qwen3/configuration_qwen3.py +27 -34
  1065. transformers/models/qwen3/modeling_qwen3.py +39 -36
  1066. transformers/models/qwen3/modular_qwen3.py +6 -4
  1067. transformers/models/qwen3_moe/configuration_qwen3_moe.py +32 -39
  1068. transformers/models/qwen3_moe/modeling_qwen3_moe.py +46 -51
  1069. transformers/models/qwen3_moe/modular_qwen3_moe.py +13 -10
  1070. transformers/models/qwen3_next/configuration_qwen3_next.py +35 -45
  1071. transformers/models/qwen3_next/modeling_qwen3_next.py +51 -47
  1072. transformers/models/qwen3_next/modular_qwen3_next.py +35 -34
  1073. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +101 -135
  1074. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +252 -355
  1075. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +196 -250
  1076. transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +48 -40
  1077. transformers/models/qwen3_vl/configuration_qwen3_vl.py +29 -27
  1078. transformers/models/qwen3_vl/modeling_qwen3_vl.py +155 -233
  1079. transformers/models/qwen3_vl/modular_qwen3_vl.py +179 -206
  1080. transformers/models/qwen3_vl/processing_qwen3_vl.py +42 -6
  1081. transformers/models/qwen3_vl/video_processing_qwen3_vl.py +12 -10
  1082. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +30 -23
  1083. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +303 -358
  1084. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +124 -87
  1085. transformers/models/rag/configuration_rag.py +15 -6
  1086. transformers/models/rag/modeling_rag.py +130 -127
  1087. transformers/models/rag/retrieval_rag.py +5 -3
  1088. transformers/models/rag/tokenization_rag.py +50 -0
  1089. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +30 -29
  1090. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +42 -53
  1091. transformers/models/reformer/configuration_reformer.py +8 -7
  1092. transformers/models/reformer/modeling_reformer.py +69 -80
  1093. transformers/models/reformer/tokenization_reformer.py +31 -11
  1094. transformers/models/regnet/configuration_regnet.py +1 -0
  1095. transformers/models/regnet/modeling_regnet.py +8 -15
  1096. transformers/models/rembert/configuration_rembert.py +2 -8
  1097. transformers/models/rembert/modeling_rembert.py +111 -121
  1098. transformers/models/rembert/tokenization_rembert.py +12 -2
  1099. transformers/models/resnet/configuration_resnet.py +1 -0
  1100. transformers/models/resnet/modeling_resnet.py +13 -27
  1101. transformers/models/roberta/configuration_roberta.py +3 -11
  1102. transformers/models/roberta/modeling_roberta.py +93 -94
  1103. transformers/models/roberta/modular_roberta.py +58 -58
  1104. transformers/models/roberta/tokenization_roberta.py +29 -17
  1105. transformers/models/roberta/tokenization_roberta_old.py +4 -2
  1106. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +3 -11
  1107. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +93 -94
  1108. transformers/models/roc_bert/configuration_roc_bert.py +2 -8
  1109. transformers/models/roc_bert/modeling_roc_bert.py +121 -122
  1110. transformers/models/roc_bert/tokenization_roc_bert.py +94 -88
  1111. transformers/models/roformer/configuration_roformer.py +3 -13
  1112. transformers/models/roformer/modeling_roformer.py +81 -85
  1113. transformers/models/roformer/tokenization_roformer.py +412 -74
  1114. transformers/models/roformer/tokenization_roformer_fast.py +160 -0
  1115. transformers/models/roformer/tokenization_utils.py +1 -0
  1116. transformers/models/rt_detr/configuration_rt_detr.py +2 -1
  1117. transformers/models/rt_detr/configuration_rt_detr_resnet.py +1 -0
  1118. transformers/models/rt_detr/image_processing_rt_detr.py +55 -54
  1119. transformers/models/rt_detr/image_processing_rt_detr_fast.py +26 -26
  1120. transformers/models/rt_detr/modeling_rt_detr.py +90 -99
  1121. transformers/models/rt_detr/modeling_rt_detr_resnet.py +6 -13
  1122. transformers/models/rt_detr/modular_rt_detr.py +16 -16
  1123. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +4 -6
  1124. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +90 -101
  1125. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +12 -19
  1126. transformers/models/rwkv/configuration_rwkv.py +4 -2
  1127. transformers/models/rwkv/modeling_rwkv.py +32 -31
  1128. transformers/models/sam/configuration_sam.py +1 -3
  1129. transformers/models/sam/image_processing_sam.py +60 -59
  1130. transformers/models/sam/image_processing_sam_fast.py +27 -25
  1131. transformers/models/sam/modeling_sam.py +41 -47
  1132. transformers/models/sam/processing_sam.py +27 -39
  1133. transformers/models/sam2/configuration_sam2.py +3 -2
  1134. transformers/models/sam2/image_processing_sam2_fast.py +15 -14
  1135. transformers/models/sam2/modeling_sam2.py +90 -96
  1136. transformers/models/sam2/modular_sam2.py +91 -86
  1137. transformers/models/sam2/processing_sam2.py +47 -31
  1138. transformers/models/sam2_video/configuration_sam2_video.py +1 -0
  1139. transformers/models/sam2_video/modeling_sam2_video.py +144 -151
  1140. transformers/models/sam2_video/modular_sam2_video.py +104 -101
  1141. transformers/models/sam2_video/processing_sam2_video.py +66 -49
  1142. transformers/models/sam2_video/video_processing_sam2_video.py +4 -1
  1143. transformers/models/sam3/configuration_sam3.py +2 -21
  1144. transformers/models/sam3/image_processing_sam3_fast.py +20 -17
  1145. transformers/models/sam3/modeling_sam3.py +170 -184
  1146. transformers/models/sam3/modular_sam3.py +8 -3
  1147. transformers/models/sam3/processing_sam3.py +52 -37
  1148. transformers/models/sam3_tracker/__init__.py +1 -0
  1149. transformers/models/sam3_tracker/configuration_sam3_tracker.py +3 -1
  1150. transformers/models/sam3_tracker/modeling_sam3_tracker.py +77 -82
  1151. transformers/models/sam3_tracker/modular_sam3_tracker.py +3 -8
  1152. transformers/models/sam3_tracker/processing_sam3_tracker.py +48 -31
  1153. transformers/models/sam3_tracker_video/__init__.py +1 -0
  1154. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +1 -25
  1155. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +122 -135
  1156. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +26 -35
  1157. transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +66 -50
  1158. transformers/models/sam3_video/configuration_sam3_video.py +1 -14
  1159. transformers/models/sam3_video/modeling_sam3_video.py +34 -33
  1160. transformers/models/sam3_video/processing_sam3_video.py +46 -26
  1161. transformers/models/sam_hq/__init__.py +1 -1
  1162. transformers/models/sam_hq/configuration_sam_hq.py +1 -3
  1163. transformers/models/sam_hq/modeling_sam_hq.py +69 -74
  1164. transformers/models/sam_hq/modular_sam_hq.py +25 -23
  1165. transformers/models/sam_hq/{processing_sam_hq.py → processing_samhq.py} +29 -41
  1166. transformers/models/seamless_m4t/configuration_seamless_m4t.py +10 -8
  1167. transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py +11 -8
  1168. transformers/models/seamless_m4t/modeling_seamless_m4t.py +194 -212
  1169. transformers/models/seamless_m4t/processing_seamless_m4t.py +39 -18
  1170. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +77 -40
  1171. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +10 -8
  1172. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +196 -204
  1173. transformers/models/seed_oss/configuration_seed_oss.py +32 -28
  1174. transformers/models/seed_oss/modeling_seed_oss.py +35 -33
  1175. transformers/models/seed_oss/modular_seed_oss.py +4 -3
  1176. transformers/models/segformer/configuration_segformer.py +10 -0
  1177. transformers/models/segformer/image_processing_segformer.py +42 -39
  1178. transformers/models/segformer/image_processing_segformer_fast.py +12 -10
  1179. transformers/models/segformer/modeling_segformer.py +31 -34
  1180. transformers/models/segformer/modular_segformer.py +10 -8
  1181. transformers/models/seggpt/configuration_seggpt.py +1 -0
  1182. transformers/models/seggpt/image_processing_seggpt.py +41 -38
  1183. transformers/models/seggpt/modeling_seggpt.py +38 -50
  1184. transformers/models/sew/configuration_sew.py +2 -4
  1185. transformers/models/sew/modeling_sew.py +36 -38
  1186. transformers/models/sew/modular_sew.py +13 -13
  1187. transformers/models/sew_d/configuration_sew_d.py +2 -4
  1188. transformers/models/sew_d/modeling_sew_d.py +30 -31
  1189. transformers/models/shieldgemma2/configuration_shieldgemma2.py +1 -0
  1190. transformers/models/shieldgemma2/modeling_shieldgemma2.py +17 -16
  1191. transformers/models/shieldgemma2/processing_shieldgemma2.py +5 -3
  1192. transformers/models/siglip/configuration_siglip.py +2 -4
  1193. transformers/models/siglip/image_processing_siglip.py +20 -17
  1194. transformers/models/siglip/image_processing_siglip_fast.py +1 -0
  1195. transformers/models/siglip/modeling_siglip.py +75 -84
  1196. transformers/models/siglip/processing_siglip.py +14 -2
  1197. transformers/models/siglip/tokenization_siglip.py +7 -6
  1198. transformers/models/siglip2/configuration_siglip2.py +2 -5
  1199. transformers/models/siglip2/image_processing_siglip2.py +16 -15
  1200. transformers/models/siglip2/image_processing_siglip2_fast.py +7 -6
  1201. transformers/models/siglip2/modeling_siglip2.py +129 -143
  1202. transformers/models/siglip2/modular_siglip2.py +46 -47
  1203. transformers/models/siglip2/processing_siglip2.py +14 -2
  1204. transformers/models/smollm3/configuration_smollm3.py +32 -29
  1205. transformers/models/smollm3/modeling_smollm3.py +39 -36
  1206. transformers/models/smollm3/modular_smollm3.py +35 -33
  1207. transformers/models/smolvlm/configuration_smolvlm.py +4 -2
  1208. transformers/models/smolvlm/image_processing_smolvlm.py +43 -42
  1209. transformers/models/smolvlm/image_processing_smolvlm_fast.py +15 -41
  1210. transformers/models/smolvlm/modeling_smolvlm.py +94 -126
  1211. transformers/models/smolvlm/modular_smolvlm.py +39 -50
  1212. transformers/models/smolvlm/processing_smolvlm.py +83 -15
  1213. transformers/models/smolvlm/video_processing_smolvlm.py +18 -16
  1214. transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +1 -0
  1215. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +27 -26
  1216. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -9
  1217. transformers/models/speech_to_text/feature_extraction_speech_to_text.py +13 -10
  1218. transformers/models/speech_to_text/modeling_speech_to_text.py +54 -66
  1219. transformers/models/speech_to_text/processing_speech_to_text.py +30 -4
  1220. transformers/models/speech_to_text/tokenization_speech_to_text.py +6 -5
  1221. transformers/models/speecht5/configuration_speecht5.py +9 -7
  1222. transformers/models/speecht5/feature_extraction_speecht5.py +37 -16
  1223. transformers/models/speecht5/modeling_speecht5.py +175 -213
  1224. transformers/models/speecht5/number_normalizer.py +1 -0
  1225. transformers/models/speecht5/processing_speecht5.py +37 -3
  1226. transformers/models/speecht5/tokenization_speecht5.py +5 -4
  1227. transformers/models/splinter/configuration_splinter.py +7 -6
  1228. transformers/models/splinter/modeling_splinter.py +59 -71
  1229. transformers/models/splinter/tokenization_splinter.py +30 -9
  1230. transformers/models/squeezebert/configuration_squeezebert.py +2 -14
  1231. transformers/models/squeezebert/modeling_squeezebert.py +62 -68
  1232. transformers/models/squeezebert/tokenization_squeezebert.py +1 -0
  1233. transformers/models/stablelm/configuration_stablelm.py +29 -24
  1234. transformers/models/stablelm/modeling_stablelm.py +45 -44
  1235. transformers/models/starcoder2/configuration_starcoder2.py +27 -30
  1236. transformers/models/starcoder2/modeling_starcoder2.py +41 -39
  1237. transformers/models/starcoder2/modular_starcoder2.py +16 -14
  1238. transformers/models/superglue/configuration_superglue.py +3 -7
  1239. transformers/models/superglue/image_processing_superglue.py +15 -15
  1240. transformers/models/superglue/image_processing_superglue_fast.py +10 -9
  1241. transformers/models/superglue/modeling_superglue.py +37 -42
  1242. transformers/models/superpoint/image_processing_superpoint.py +15 -15
  1243. transformers/models/superpoint/image_processing_superpoint_fast.py +11 -8
  1244. transformers/models/superpoint/modeling_superpoint.py +16 -18
  1245. transformers/models/swiftformer/configuration_swiftformer.py +1 -0
  1246. transformers/models/swiftformer/modeling_swiftformer.py +14 -18
  1247. transformers/models/swin/configuration_swin.py +1 -0
  1248. transformers/models/swin/modeling_swin.py +86 -86
  1249. transformers/models/swin2sr/configuration_swin2sr.py +1 -0
  1250. transformers/models/swin2sr/image_processing_swin2sr.py +13 -10
  1251. transformers/models/swin2sr/image_processing_swin2sr_fast.py +8 -4
  1252. transformers/models/swin2sr/modeling_swin2sr.py +63 -81
  1253. transformers/models/swinv2/configuration_swinv2.py +1 -0
  1254. transformers/models/swinv2/modeling_swinv2.py +104 -108
  1255. transformers/models/switch_transformers/configuration_switch_transformers.py +7 -11
  1256. transformers/models/switch_transformers/modeling_switch_transformers.py +44 -37
  1257. transformers/models/switch_transformers/modular_switch_transformers.py +41 -34
  1258. transformers/models/t5/configuration_t5.py +8 -14
  1259. transformers/models/t5/modeling_t5.py +92 -88
  1260. transformers/models/t5/tokenization_t5.py +9 -3
  1261. transformers/models/t5gemma/configuration_t5gemma.py +41 -43
  1262. transformers/models/t5gemma/modeling_t5gemma.py +107 -104
  1263. transformers/models/t5gemma/modular_t5gemma.py +120 -124
  1264. transformers/models/t5gemma2/configuration_t5gemma2.py +120 -80
  1265. transformers/models/t5gemma2/modeling_t5gemma2.py +125 -141
  1266. transformers/models/t5gemma2/modular_t5gemma2.py +104 -393
  1267. transformers/models/table_transformer/configuration_table_transformer.py +2 -1
  1268. transformers/models/table_transformer/modeling_table_transformer.py +49 -51
  1269. transformers/models/tapas/configuration_tapas.py +2 -12
  1270. transformers/models/tapas/modeling_tapas.py +67 -68
  1271. transformers/models/tapas/tokenization_tapas.py +153 -115
  1272. transformers/models/textnet/configuration_textnet.py +1 -0
  1273. transformers/models/textnet/image_processing_textnet.py +25 -22
  1274. transformers/models/textnet/image_processing_textnet_fast.py +10 -8
  1275. transformers/models/textnet/modeling_textnet.py +16 -28
  1276. transformers/models/time_series_transformer/configuration_time_series_transformer.py +8 -5
  1277. transformers/models/time_series_transformer/modeling_time_series_transformer.py +81 -83
  1278. transformers/models/timesfm/configuration_timesfm.py +1 -0
  1279. transformers/models/timesfm/modeling_timesfm.py +22 -33
  1280. transformers/models/timesfm/modular_timesfm.py +21 -32
  1281. transformers/models/timesformer/configuration_timesformer.py +1 -0
  1282. transformers/models/timesformer/modeling_timesformer.py +16 -15
  1283. transformers/models/timm_backbone/configuration_timm_backbone.py +1 -0
  1284. transformers/models/timm_backbone/modeling_timm_backbone.py +15 -17
  1285. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -5
  1286. transformers/models/timm_wrapper/image_processing_timm_wrapper.py +5 -4
  1287. transformers/models/timm_wrapper/modeling_timm_wrapper.py +29 -34
  1288. transformers/models/trocr/configuration_trocr.py +8 -11
  1289. transformers/models/trocr/modeling_trocr.py +44 -45
  1290. transformers/models/trocr/processing_trocr.py +25 -5
  1291. transformers/models/tvp/configuration_tvp.py +2 -5
  1292. transformers/models/tvp/image_processing_tvp.py +52 -50
  1293. transformers/models/tvp/image_processing_tvp_fast.py +15 -15
  1294. transformers/models/tvp/modeling_tvp.py +27 -27
  1295. transformers/models/tvp/processing_tvp.py +14 -2
  1296. transformers/models/udop/configuration_udop.py +7 -16
  1297. transformers/models/udop/modeling_udop.py +73 -71
  1298. transformers/models/udop/processing_udop.py +26 -7
  1299. transformers/models/udop/tokenization_udop.py +105 -84
  1300. transformers/models/umt5/configuration_umt5.py +7 -8
  1301. transformers/models/umt5/modeling_umt5.py +90 -94
  1302. transformers/models/unispeech/configuration_unispeech.py +2 -4
  1303. transformers/models/unispeech/modeling_unispeech.py +49 -51
  1304. transformers/models/unispeech/modular_unispeech.py +22 -22
  1305. transformers/models/unispeech_sat/configuration_unispeech_sat.py +2 -4
  1306. transformers/models/unispeech_sat/modeling_unispeech_sat.py +65 -69
  1307. transformers/models/unispeech_sat/modular_unispeech_sat.py +23 -23
  1308. transformers/models/univnet/feature_extraction_univnet.py +14 -14
  1309. transformers/models/univnet/modeling_univnet.py +8 -8
  1310. transformers/models/upernet/configuration_upernet.py +1 -0
  1311. transformers/models/upernet/modeling_upernet.py +13 -11
  1312. transformers/models/vaultgemma/__init__.py +1 -0
  1313. transformers/models/vaultgemma/configuration_vaultgemma.py +33 -29
  1314. transformers/models/vaultgemma/modeling_vaultgemma.py +41 -39
  1315. transformers/models/vaultgemma/modular_vaultgemma.py +31 -29
  1316. transformers/models/video_llama_3/configuration_video_llama_3.py +0 -4
  1317. transformers/models/video_llama_3/image_processing_video_llama_3.py +42 -43
  1318. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +14 -12
  1319. transformers/models/video_llama_3/modeling_video_llama_3.py +109 -157
  1320. transformers/models/video_llama_3/modular_video_llama_3.py +146 -155
  1321. transformers/models/video_llama_3/processing_video_llama_3.py +39 -5
  1322. transformers/models/video_llama_3/video_processing_video_llama_3.py +23 -42
  1323. transformers/models/video_llava/configuration_video_llava.py +1 -4
  1324. transformers/models/video_llava/image_processing_video_llava.py +38 -35
  1325. transformers/models/video_llava/modeling_video_llava.py +146 -146
  1326. transformers/models/video_llava/processing_video_llava.py +78 -38
  1327. transformers/models/video_llava/video_processing_video_llava.py +1 -0
  1328. transformers/models/videomae/configuration_videomae.py +1 -0
  1329. transformers/models/videomae/image_processing_videomae.py +34 -31
  1330. transformers/models/videomae/modeling_videomae.py +17 -14
  1331. transformers/models/videomae/video_processing_videomae.py +1 -0
  1332. transformers/models/vilt/configuration_vilt.py +4 -6
  1333. transformers/models/vilt/image_processing_vilt.py +30 -29
  1334. transformers/models/vilt/image_processing_vilt_fast.py +16 -15
  1335. transformers/models/vilt/modeling_vilt.py +90 -116
  1336. transformers/models/vilt/processing_vilt.py +14 -2
  1337. transformers/models/vipllava/configuration_vipllava.py +1 -4
  1338. transformers/models/vipllava/modeling_vipllava.py +70 -99
  1339. transformers/models/vipllava/modular_vipllava.py +54 -78
  1340. transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +1 -0
  1341. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +27 -28
  1342. transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +1 -0
  1343. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +41 -46
  1344. transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +16 -2
  1345. transformers/models/visual_bert/configuration_visual_bert.py +2 -6
  1346. transformers/models/visual_bert/modeling_visual_bert.py +92 -98
  1347. transformers/models/vit/configuration_vit.py +1 -0
  1348. transformers/models/vit/image_processing_vit.py +22 -19
  1349. transformers/models/vit/image_processing_vit_fast.py +1 -0
  1350. transformers/models/vit/modeling_vit.py +17 -17
  1351. transformers/models/vit_mae/configuration_vit_mae.py +1 -0
  1352. transformers/models/vit_mae/modeling_vit_mae.py +27 -29
  1353. transformers/models/vit_msn/configuration_vit_msn.py +1 -0
  1354. transformers/models/vit_msn/modeling_vit_msn.py +16 -18
  1355. transformers/models/vitdet/configuration_vitdet.py +1 -0
  1356. transformers/models/vitdet/modeling_vitdet.py +14 -14
  1357. transformers/models/vitmatte/configuration_vitmatte.py +5 -2
  1358. transformers/models/vitmatte/image_processing_vitmatte.py +18 -15
  1359. transformers/models/vitmatte/image_processing_vitmatte_fast.py +18 -16
  1360. transformers/models/vitmatte/modeling_vitmatte.py +11 -14
  1361. transformers/models/vitpose/configuration_vitpose.py +7 -4
  1362. transformers/models/vitpose/image_processing_vitpose.py +25 -24
  1363. transformers/models/vitpose/image_processing_vitpose_fast.py +11 -9
  1364. transformers/models/vitpose/modeling_vitpose.py +14 -14
  1365. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +1 -0
  1366. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +10 -8
  1367. transformers/models/vits/configuration_vits.py +1 -4
  1368. transformers/models/vits/modeling_vits.py +42 -44
  1369. transformers/models/vits/tokenization_vits.py +4 -3
  1370. transformers/models/vivit/configuration_vivit.py +1 -0
  1371. transformers/models/vivit/image_processing_vivit.py +39 -36
  1372. transformers/models/vivit/modeling_vivit.py +8 -6
  1373. transformers/models/vjepa2/__init__.py +1 -0
  1374. transformers/models/vjepa2/configuration_vjepa2.py +1 -0
  1375. transformers/models/vjepa2/modeling_vjepa2.py +32 -31
  1376. transformers/models/vjepa2/video_processing_vjepa2.py +1 -0
  1377. transformers/models/voxtral/__init__.py +1 -0
  1378. transformers/models/voxtral/configuration_voxtral.py +2 -0
  1379. transformers/models/voxtral/modeling_voxtral.py +47 -40
  1380. transformers/models/voxtral/modular_voxtral.py +40 -37
  1381. transformers/models/voxtral/processing_voxtral.py +48 -25
  1382. transformers/models/wav2vec2/configuration_wav2vec2.py +2 -4
  1383. transformers/models/wav2vec2/feature_extraction_wav2vec2.py +10 -7
  1384. transformers/models/wav2vec2/modeling_wav2vec2.py +121 -73
  1385. transformers/models/wav2vec2/processing_wav2vec2.py +35 -6
  1386. transformers/models/wav2vec2/tokenization_wav2vec2.py +332 -20
  1387. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +2 -4
  1388. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +62 -70
  1389. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +48 -57
  1390. transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +35 -6
  1391. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +2 -4
  1392. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +77 -90
  1393. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +30 -37
  1394. transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +17 -16
  1395. transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +55 -36
  1396. transformers/models/wavlm/configuration_wavlm.py +2 -4
  1397. transformers/models/wavlm/modeling_wavlm.py +48 -50
  1398. transformers/models/wavlm/modular_wavlm.py +5 -4
  1399. transformers/models/whisper/configuration_whisper.py +5 -6
  1400. transformers/models/whisper/english_normalizer.py +4 -3
  1401. transformers/models/whisper/feature_extraction_whisper.py +24 -9
  1402. transformers/models/whisper/generation_whisper.py +48 -26
  1403. transformers/models/whisper/modeling_whisper.py +73 -79
  1404. transformers/models/whisper/processing_whisper.py +20 -3
  1405. transformers/models/whisper/tokenization_whisper.py +43 -11
  1406. transformers/models/x_clip/configuration_x_clip.py +2 -4
  1407. transformers/models/x_clip/modeling_x_clip.py +93 -96
  1408. transformers/models/x_clip/processing_x_clip.py +14 -2
  1409. transformers/models/xcodec/configuration_xcodec.py +6 -4
  1410. transformers/models/xcodec/modeling_xcodec.py +17 -20
  1411. transformers/models/xglm/configuration_xglm.py +8 -9
  1412. transformers/models/xglm/modeling_xglm.py +55 -60
  1413. transformers/models/xglm/tokenization_xglm.py +11 -3
  1414. transformers/models/xlm/configuration_xlm.py +8 -10
  1415. transformers/models/xlm/modeling_xlm.py +144 -144
  1416. transformers/models/xlm/tokenization_xlm.py +5 -3
  1417. transformers/models/xlm_roberta/configuration_xlm_roberta.py +3 -11
  1418. transformers/models/xlm_roberta/modeling_xlm_roberta.py +194 -195
  1419. transformers/models/xlm_roberta/modular_xlm_roberta.py +53 -50
  1420. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +18 -8
  1421. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +2 -10
  1422. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +93 -94
  1423. transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +70 -67
  1424. transformers/models/xlnet/configuration_xlnet.py +12 -3
  1425. transformers/models/xlnet/modeling_xlnet.py +163 -152
  1426. transformers/models/xlnet/tokenization_xlnet.py +9 -2
  1427. transformers/models/xlstm/configuration_xlstm.py +12 -8
  1428. transformers/models/xlstm/modeling_xlstm.py +65 -62
  1429. transformers/models/xmod/configuration_xmod.py +3 -11
  1430. transformers/models/xmod/modeling_xmod.py +110 -108
  1431. transformers/models/yolos/configuration_yolos.py +1 -0
  1432. transformers/models/yolos/image_processing_yolos.py +62 -60
  1433. transformers/models/yolos/image_processing_yolos_fast.py +45 -42
  1434. transformers/models/yolos/modeling_yolos.py +16 -16
  1435. transformers/models/yolos/modular_yolos.py +19 -17
  1436. transformers/models/yoso/configuration_yoso.py +2 -8
  1437. transformers/models/yoso/modeling_yoso.py +63 -70
  1438. transformers/models/zamba/configuration_zamba.py +8 -5
  1439. transformers/models/zamba/modeling_zamba.py +78 -81
  1440. transformers/models/zamba2/configuration_zamba2.py +50 -44
  1441. transformers/models/zamba2/modeling_zamba2.py +97 -97
  1442. transformers/models/zamba2/modular_zamba2.py +48 -46
  1443. transformers/models/zoedepth/configuration_zoedepth.py +2 -1
  1444. transformers/models/zoedepth/image_processing_zoedepth.py +29 -28
  1445. transformers/models/zoedepth/image_processing_zoedepth_fast.py +24 -21
  1446. transformers/models/zoedepth/modeling_zoedepth.py +18 -26
  1447. transformers/pipelines/__init__.py +114 -57
  1448. transformers/pipelines/any_to_any.py +22 -14
  1449. transformers/pipelines/audio_utils.py +2 -1
  1450. transformers/pipelines/automatic_speech_recognition.py +12 -20
  1451. transformers/pipelines/base.py +27 -15
  1452. transformers/{models/pe_audio/processing_pe_audio.py → pipelines/deprecated/__init__.py} +3 -10
  1453. transformers/pipelines/deprecated/text2text_generation.py +408 -0
  1454. transformers/pipelines/document_question_answering.py +2 -4
  1455. transformers/pipelines/image_text_to_text.py +1 -0
  1456. transformers/pipelines/image_to_text.py +229 -0
  1457. transformers/pipelines/question_answering.py +44 -5
  1458. transformers/pipelines/text_classification.py +14 -1
  1459. transformers/pipelines/text_generation.py +1 -1
  1460. transformers/pipelines/text_to_audio.py +2 -2
  1461. transformers/pipelines/token_classification.py +22 -1
  1462. transformers/pipelines/video_classification.py +9 -1
  1463. transformers/pipelines/zero_shot_audio_classification.py +1 -0
  1464. transformers/pipelines/zero_shot_classification.py +6 -0
  1465. transformers/pipelines/zero_shot_image_classification.py +7 -0
  1466. transformers/processing_utils.py +145 -230
  1467. transformers/quantizers/auto.py +4 -2
  1468. transformers/quantizers/base.py +173 -53
  1469. transformers/quantizers/quantizer_aqlm.py +23 -2
  1470. transformers/quantizers/quantizer_auto_round.py +12 -2
  1471. transformers/quantizers/quantizer_awq.py +89 -20
  1472. transformers/quantizers/quantizer_bitnet.py +14 -4
  1473. transformers/quantizers/quantizer_bnb_4bit.py +155 -18
  1474. transformers/quantizers/quantizer_bnb_8bit.py +110 -24
  1475. transformers/quantizers/quantizer_compressed_tensors.py +9 -2
  1476. transformers/quantizers/quantizer_eetq.py +74 -16
  1477. transformers/quantizers/quantizer_fbgemm_fp8.py +138 -38
  1478. transformers/quantizers/quantizer_finegrained_fp8.py +113 -26
  1479. transformers/quantizers/quantizer_fp_quant.py +82 -52
  1480. transformers/quantizers/quantizer_gptq.py +28 -8
  1481. transformers/quantizers/quantizer_higgs.py +60 -42
  1482. transformers/quantizers/quantizer_hqq.py +153 -144
  1483. transformers/quantizers/quantizer_mxfp4.py +194 -14
  1484. transformers/quantizers/quantizer_quanto.py +79 -35
  1485. transformers/quantizers/quantizer_quark.py +18 -36
  1486. transformers/quantizers/quantizer_spqr.py +12 -4
  1487. transformers/quantizers/quantizer_torchao.py +325 -50
  1488. transformers/quantizers/quantizer_vptq.py +27 -4
  1489. transformers/quantizers/quantizers_utils.py +0 -20
  1490. transformers/safetensors_conversion.py +3 -9
  1491. transformers/testing_utils.py +82 -326
  1492. transformers/tokenization_mistral_common.py +903 -568
  1493. transformers/tokenization_utils_base.py +340 -220
  1494. transformers/tokenization_utils_sentencepiece.py +6 -5
  1495. transformers/tokenization_utils_tokenizers.py +113 -226
  1496. transformers/trainer.py +53 -60
  1497. transformers/trainer_callback.py +0 -8
  1498. transformers/trainer_seq2seq.py +1 -5
  1499. transformers/trainer_utils.py +1 -1
  1500. transformers/training_args.py +41 -77
  1501. transformers/utils/__init__.py +4 -8
  1502. transformers/utils/attention_visualizer.py +5 -5
  1503. transformers/utils/auto_docstring.py +37 -599
  1504. transformers/utils/doc.py +36 -4
  1505. transformers/utils/dummy_pt_objects.py +42 -0
  1506. transformers/utils/generic.py +28 -111
  1507. transformers/utils/hub.py +15 -5
  1508. transformers/utils/import_utils.py +32 -165
  1509. transformers/utils/kernel_config.py +19 -74
  1510. transformers/utils/loading_report.py +15 -25
  1511. transformers/utils/quantization_config.py +241 -72
  1512. transformers/video_processing_utils.py +39 -41
  1513. transformers/video_utils.py +22 -18
  1514. {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/METADATA +236 -284
  1515. transformers-5.0.0rc0.dist-info/RECORD +1987 -0
  1516. {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/WHEEL +1 -1
  1517. transformers/integrations/moe.py +0 -360
  1518. transformers/integrations/quark.py +0 -53
  1519. transformers/loss/loss_lw_detr.py +0 -356
  1520. transformers/models/ernie4_5_vl_moe/__init__.py +0 -31
  1521. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +0 -340
  1522. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +0 -455
  1523. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +0 -231
  1524. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +0 -1936
  1525. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +0 -1925
  1526. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +0 -249
  1527. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +0 -593
  1528. transformers/models/fast_vlm/__init__.py +0 -27
  1529. transformers/models/fast_vlm/configuration_fast_vlm.py +0 -137
  1530. transformers/models/fast_vlm/modeling_fast_vlm.py +0 -432
  1531. transformers/models/fast_vlm/modular_fast_vlm.py +0 -373
  1532. transformers/models/glm4_moe_lite/__init__.py +0 -28
  1533. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +0 -233
  1534. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +0 -740
  1535. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +0 -302
  1536. transformers/models/glm_image/__init__.py +0 -31
  1537. transformers/models/glm_image/configuration_glm_image.py +0 -351
  1538. transformers/models/glm_image/image_processing_glm_image.py +0 -503
  1539. transformers/models/glm_image/image_processing_glm_image_fast.py +0 -294
  1540. transformers/models/glm_image/modeling_glm_image.py +0 -1642
  1541. transformers/models/glm_image/modular_glm_image.py +0 -1531
  1542. transformers/models/glm_image/processing_glm_image.py +0 -217
  1543. transformers/models/glmasr/__init__.py +0 -29
  1544. transformers/models/glmasr/configuration_glmasr.py +0 -196
  1545. transformers/models/glmasr/modeling_glmasr.py +0 -517
  1546. transformers/models/glmasr/modular_glmasr.py +0 -443
  1547. transformers/models/glmasr/processing_glmasr.py +0 -331
  1548. transformers/models/jais2/__init__.py +0 -27
  1549. transformers/models/jais2/configuration_jais2.py +0 -148
  1550. transformers/models/jais2/modeling_jais2.py +0 -484
  1551. transformers/models/jais2/modular_jais2.py +0 -194
  1552. transformers/models/lasr/__init__.py +0 -29
  1553. transformers/models/lasr/configuration_lasr.py +0 -244
  1554. transformers/models/lasr/feature_extraction_lasr.py +0 -275
  1555. transformers/models/lasr/modeling_lasr.py +0 -727
  1556. transformers/models/lasr/modular_lasr.py +0 -574
  1557. transformers/models/lasr/processing_lasr.py +0 -100
  1558. transformers/models/lasr/tokenization_lasr.py +0 -184
  1559. transformers/models/lighton_ocr/__init__.py +0 -28
  1560. transformers/models/lighton_ocr/configuration_lighton_ocr.py +0 -128
  1561. transformers/models/lighton_ocr/modeling_lighton_ocr.py +0 -463
  1562. transformers/models/lighton_ocr/modular_lighton_ocr.py +0 -404
  1563. transformers/models/lighton_ocr/processing_lighton_ocr.py +0 -229
  1564. transformers/models/lw_detr/__init__.py +0 -27
  1565. transformers/models/lw_detr/configuration_lw_detr.py +0 -374
  1566. transformers/models/lw_detr/modeling_lw_detr.py +0 -1702
  1567. transformers/models/lw_detr/modular_lw_detr.py +0 -1615
  1568. transformers/models/minimax_m2/__init__.py +0 -28
  1569. transformers/models/minimax_m2/configuration_minimax_m2.py +0 -188
  1570. transformers/models/minimax_m2/modeling_minimax_m2.py +0 -704
  1571. transformers/models/minimax_m2/modular_minimax_m2.py +0 -346
  1572. transformers/models/paddleocr_vl/__init__.py +0 -31
  1573. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +0 -335
  1574. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +0 -503
  1575. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +0 -209
  1576. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +0 -1683
  1577. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +0 -1380
  1578. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +0 -133
  1579. transformers/models/pe_audio/__init__.py +0 -29
  1580. transformers/models/pe_audio/configuration_pe_audio.py +0 -204
  1581. transformers/models/pe_audio/feature_extraction_pe_audio.py +0 -160
  1582. transformers/models/pe_audio/modeling_pe_audio.py +0 -819
  1583. transformers/models/pe_audio/modular_pe_audio.py +0 -298
  1584. transformers/models/pe_audio_video/__init__.py +0 -28
  1585. transformers/models/pe_audio_video/configuration_pe_audio_video.py +0 -223
  1586. transformers/models/pe_audio_video/modeling_pe_audio_video.py +0 -971
  1587. transformers/models/pe_audio_video/modular_pe_audio_video.py +0 -763
  1588. transformers/models/pe_video/__init__.py +0 -29
  1589. transformers/models/pe_video/configuration_pe_video.py +0 -209
  1590. transformers/models/pe_video/modeling_pe_video.py +0 -647
  1591. transformers/models/pe_video/modular_pe_video.py +0 -231
  1592. transformers/models/pe_video/processing_pe_video.py +0 -10
  1593. transformers/models/pe_video/video_processing_pe_video.py +0 -64
  1594. transformers/models/pixio/__init__.py +0 -29
  1595. transformers/models/pixio/configuration_pixio.py +0 -150
  1596. transformers/models/pixio/modeling_pixio.py +0 -507
  1597. transformers/models/pixio/modular_pixio.py +0 -403
  1598. transformers/models/solar_open/__init__.py +0 -27
  1599. transformers/models/solar_open/configuration_solar_open.py +0 -184
  1600. transformers/models/solar_open/modeling_solar_open.py +0 -642
  1601. transformers/models/solar_open/modular_solar_open.py +0 -224
  1602. transformers/trainer_jit_checkpoint.py +0 -125
  1603. transformers-5.0.0.dist-info/RECORD +0 -2068
  1604. {transformers-5.0.0.dist-info/licenses → transformers-5.0.0rc0.dist-info}/LICENSE +0 -0
  1605. {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/entry_points.txt +0 -0
  1606. {transformers-5.0.0.dist-info → transformers-5.0.0rc0.dist-info}/top_level.txt +0 -0
@@ -1,1615 +0,0 @@
1
- # Copyright 2026 The HuggingFace Inc. team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- import math
15
- from collections.abc import Callable
16
- from dataclasses import dataclass
17
- from typing import Any
18
-
19
- import torch
20
- from torch import nn
21
-
22
- from ... import initialization as init
23
- from ...activations import ACT2FN
24
- from ...configuration_utils import PreTrainedConfig
25
- from ...modeling_layers import GradientCheckpointingLayer
26
- from ...modeling_outputs import BackboneOutput
27
- from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
28
- from ...processing_utils import Unpack
29
- from ...pytorch_utils import meshgrid
30
- from ...utils import ModelOutput, TransformersKwargs, auto_docstring, logging
31
- from ...utils.generic import check_model_inputs
32
- from ..auto.configuration_auto import AutoConfig
33
- from ..convnext.modeling_convnext import ConvNextLayerNorm
34
- from ..dab_detr.modeling_dab_detr import gen_sine_position_embeddings
35
- from ..deformable_detr.modeling_deformable_detr import (
36
- DeformableDetrDecoderOutput,
37
- DeformableDetrForObjectDetection,
38
- DeformableDetrMLPPredictionHead,
39
- DeformableDetrModel,
40
- DeformableDetrMultiscaleDeformableAttention,
41
- )
42
- from ..llama.modeling_llama import eager_attention_forward
43
- from ..rt_detr.configuration_rt_detr import CONFIG_MAPPING
44
- from ..rt_detr.modeling_rt_detr import RTDetrConvNormLayer
45
- from ..vit.modeling_vit import ViTAttention, ViTEncoder, ViTSelfAttention
46
- from ..vitdet.configuration_vitdet import VitDetConfig
47
- from ..vitdet.modeling_vitdet import (
48
- VitDetBackbone,
49
- VitDetEmbeddings,
50
- VitDetMlp,
51
- VitDetPreTrainedModel,
52
- )
53
-
54
-
55
- logger = logging.get_logger(__name__)
56
-
57
-
58
- class LwDetrViTConfig(VitDetConfig):
-     r"""
-     This is the configuration class to store the configuration of a [`LwDetrViTModel`]. It is used to instantiate an
-     LW-DETR ViT model according to the specified arguments, defining the model architecture. Instantiating a configuration
-     with the defaults will yield a similar configuration to that of the LW-DETR ViT
-     [AnnaZhang/lwdetr_small_60e_coco](https://huggingface.co/AnnaZhang/lwdetr_small_60e_coco) architecture.
-
-     LW-DETR ViT is the Vision Transformer backbone used in the LW-DETR model for real-time object detection. It features
-     interleaved window and global attention mechanisms to reduce computational complexity while maintaining high performance.
-     The model uses a window-major feature map organization for efficient attention computation.
-
-     Configuration objects inherit from [`VitDetConfig`] and can be used to control the model outputs. Read the
-     documentation from [`VitDetConfig`] for more information.
-
-     Args:
-         hidden_size (`int`, *optional*, defaults to 768):
-             Dimensionality of the encoder layers and the pooler layer.
-         num_hidden_layers (`int`, *optional*, defaults to 12):
-             Number of hidden layers in the Transformer encoder.
-         num_attention_heads (`int`, *optional*, defaults to 12):
-             Number of attention heads for each attention layer in the Transformer encoder.
-         mlp_ratio (`int`, *optional*, defaults to 4):
-             Ratio of the MLP hidden dimensionality to the embedding dimensionality.
-         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
-             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-             `"relu"`, `"selu"` and `"gelu_new"` are supported.
-         dropout_prob (`float`, *optional*, defaults to 0.0):
-             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-         initializer_range (`float`, *optional*, defaults to 0.02):
-             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-         layer_norm_eps (`float`, *optional*, defaults to 1e-06):
-             The epsilon used by the layer normalization layers.
-         image_size (`int`, *optional*, defaults to 256):
-             The size (resolution) of each image.
-         pretrain_image_size (`int`, *optional*, defaults to 224):
-             The size (resolution) of each image during pretraining.
-         patch_size (`int`, *optional*, defaults to 16):
-             The size (resolution) of each patch.
-         num_channels (`int`, *optional*, defaults to 3):
-             The number of input channels.
-         qkv_bias (`bool`, *optional*, defaults to `True`):
-             Whether to add a bias to the queries, keys and values.
-         window_block_indices (`list[int]`, *optional*, defaults to `[]`):
-             List of indices of blocks that should have window attention instead of regular global self-attention.
-         use_absolute_position_embeddings (`bool`, *optional*, defaults to `True`):
-             Whether to add absolute position embeddings to the patch embeddings.
-         out_features (`list[str]`, *optional*):
-             If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
-             (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
-             corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
-             same order as defined in the `stage_names` attribute.
-         out_indices (`list[int]`, *optional*):
-             If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
-             many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
-             If unset and `out_features` is unset, will default to the last stage. Must be in the
-             same order as defined in the `stage_names` attribute.
-         cae_init_values (`float`, *optional*, defaults to 0.1):
-             Initialization value for the CAE layer-scale parameters (`gamma_1` and `gamma_2`) in each encoder layer.
-         num_windows (`int`, *optional*, defaults to 16):
-             Number of windows for window-based attention. Must be a perfect square, and the image size must be
-             divisible by the square root of this value. This enables efficient window-major feature map organization.
-
-     Example:
-
-     ```python
-     >>> from transformers import LwDetrViTConfig, LwDetrViTModel
-
-     >>> # Initializing a LW-DETR ViT configuration
-     >>> configuration = LwDetrViTConfig()
-
-     >>> # Initializing a model (with random weights) from the configuration
-     >>> model = LwDetrViTModel(configuration)
-
-     >>> # Accessing the model configuration
-     >>> configuration = model.config
-     ```"""
-
135
- model_type = "lw_detr_vit"
136
-
137
- def __init__(
138
- self,
139
- hidden_size=768,
140
- num_hidden_layers=12,
141
- num_attention_heads=12,
142
- mlp_ratio=4,
143
- hidden_act="gelu",
144
- dropout_prob=0.0,
145
- initializer_range=0.02,
146
- layer_norm_eps=1e-6,
147
- image_size=256,
148
- pretrain_image_size=224,
149
- patch_size=16,
150
- num_channels=3,
151
- qkv_bias=True,
152
- window_block_indices=[],
153
- use_absolute_position_embeddings=True,
154
- out_features=None,
155
- out_indices=None,
156
- cae_init_values: float = 0.1,
157
- num_windows=16,
158
- **kwargs,
159
- ):
160
- super().__init__(
161
- hidden_size=hidden_size,
162
- num_hidden_layers=num_hidden_layers,
163
- num_attention_heads=num_attention_heads,
164
- mlp_ratio=mlp_ratio,
165
- hidden_act=hidden_act,
166
- dropout_prob=dropout_prob,
167
- initializer_range=initializer_range,
168
- layer_norm_eps=layer_norm_eps,
169
- image_size=image_size,
170
- pretrain_image_size=pretrain_image_size,
171
- patch_size=patch_size,
172
- num_channels=num_channels,
173
- qkv_bias=qkv_bias,
174
- window_block_indices=window_block_indices,
175
- use_absolute_position_embeddings=use_absolute_position_embeddings,
176
- out_features=out_features,
177
- out_indices=out_indices,
178
- **kwargs,
179
- )
180
- del self.residual_block_indices
181
- del self.use_relative_position_embeddings
182
- del self.window_size
183
- del self.drop_path_rate
184
-
185
- self.cae_init_values = cae_init_values
186
- if num_windows % math.sqrt(num_windows) != 0:
187
- raise ValueError(
188
- f"`num_windows` has to be a perfect square, where num_windows % math.sqrt(num_windows) != 0, but got {num_windows}."
189
- )
190
- if image_size / num_windows % math.sqrt(num_windows) != 0:
191
- raise ValueError(
192
- f"`image_size` has to be divisible by `num_windows`, where image_size / num_windows % math.sqrt(num_windows) != 0,but got {image_size} and {num_windows}."
193
- )
194
- self.num_windows = num_windows
195
- self.num_windows_side = int(math.sqrt(num_windows))
196
-
197
-
198
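The two `ValueError` checks above encode the window-layout invariants. A minimal sketch of how they behave with the defaults, assuming `LwDetrViTConfig` is importable from the 5.0.0 wheel that still ships this module:

```python
import math

from transformers import LwDetrViTConfig

# Defaults (image_size=256, num_windows=16) satisfy both constraints:
# 16 is a perfect square and 256 / 16 = 16 is divisible by sqrt(16) = 4.
config = LwDetrViTConfig()
print(config.num_windows_side == int(math.sqrt(config.num_windows)))  # True

# A non-square window count is rejected at construction time.
try:
    LwDetrViTConfig(num_windows=12)
except ValueError as err:
    print(err)
```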
- class LwDetrConfig(PreTrainedConfig):
-     r"""
-     This is the configuration class to store the configuration of a [`LwDetrModel`]. It is used to instantiate
-     an LW-DETR model according to the specified arguments, defining the model architecture. Instantiating a
-     configuration with the defaults will yield a similar configuration to that of the LW-DETR
-     [AnnaZhang/lwdetr_small_60e_coco](https://huggingface.co/AnnaZhang/lwdetr_small_60e_coco) architecture.
-
-     LW-DETR (Lightweight Detection Transformer) is a transformer-based object detection model designed for real-time
-     detection tasks. It replaces traditional CNN-based detectors like YOLO with a more efficient transformer architecture
-     that achieves competitive performance while being computationally lightweight.
-
-     Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
-     documentation from [`PreTrainedConfig`] for more information.
-
-     Args:
-         backbone_config (`PreTrainedConfig` or `dict`, *optional*):
-             The configuration of the backbone model. If not provided, will default to `LwDetrViTConfig` with
-             a small ViT architecture optimized for detection tasks.
-         projector_scale_factors (`list[float]`, *optional*, defaults to `[]`):
-             Scale factors for the feature pyramid network. Each scale factor determines the resolution of features
-             at different levels. Supported values are 0.5, 1.0, and 2.0.
-         hidden_expansion (`float`, *optional*, defaults to 0.5):
-             Expansion factor for hidden dimensions in the projector layers.
-         c2f_num_blocks (`int`, *optional*, defaults to 3):
-             Number of blocks in the C2F layer.
-         activation_function (`str`, *optional*, defaults to `"silu"`):
-             The non-linear activation function in the projector. Supported values are `"silu"`, `"relu"`, `"gelu"`.
-         batch_norm_eps (`float`, *optional*, defaults to 1e-05):
-             The epsilon value for batch normalization layers.
-         d_model (`int`, *optional*, defaults to 256):
-             Dimension of the model layers and the number of expected features in the decoder inputs.
-         dropout (`float`, *optional*, defaults to 0.1):
-             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-         decoder_ffn_dim (`int`, *optional*, defaults to 2048):
-             Dimension of the "intermediate" (often named feed-forward) layer in the decoder.
-         decoder_n_points (`int`, *optional*, defaults to 4):
-             The number of sampled keys in each feature level for each attention head in the decoder.
-         decoder_layers (`int`, *optional*, defaults to 3):
-             Number of decoder layers in the transformer.
-         decoder_self_attention_heads (`int`, *optional*, defaults to 8):
-             Number of attention heads for each attention layer in the decoder self-attention.
-         decoder_cross_attention_heads (`int`, *optional*, defaults to 16):
-             Number of attention heads for each attention layer in the decoder cross-attention.
-         decoder_activation_function (`str`, *optional*, defaults to `"relu"`):
-             The non-linear activation function in the decoder. Supported values are `"relu"`, `"silu"`, `"gelu"`.
-         num_queries (`int`, *optional*, defaults to 300):
-             Number of object queries, i.e. detection slots. This is the maximal number of objects
-             [`LwDetrModel`] can detect in a single image.
-         attention_bias (`bool`, *optional*, defaults to `True`):
-             Whether to add bias to the attention layers.
-         attention_dropout (`float`, *optional*, defaults to 0.0):
-             The dropout ratio for the attention probabilities.
-         activation_dropout (`float`, *optional*, defaults to 0.0):
-             The dropout ratio for activations inside the fully connected layer.
-         group_detr (`int`, *optional*, defaults to 13):
-             Number of groups for the Group DETR attention mechanism, which helps reduce computational complexity.
-         init_std (`float`, *optional*, defaults to 0.02):
-             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-         disable_custom_kernels (`bool`, *optional*, defaults to `True`):
-             Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom
-             kernels are not supported by PyTorch ONNX export.
-         class_cost (`float`, *optional*, defaults to 2):
-             Relative weight of the classification error in the Hungarian matching cost.
-         bbox_cost (`float`, *optional*, defaults to 5):
-             Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
-         giou_cost (`float`, *optional*, defaults to 2):
-             Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
-         mask_loss_coefficient (`float`, *optional*, defaults to 1):
-             Relative weight of the Focal loss in the panoptic segmentation loss.
-         dice_loss_coefficient (`float`, *optional*, defaults to 1):
-             Relative weight of the DICE/F-1 loss in the panoptic segmentation loss.
-         bbox_loss_coefficient (`float`, *optional*, defaults to 5):
-             Relative weight of the L1 bounding box loss in the object detection loss.
-         giou_loss_coefficient (`float`, *optional*, defaults to 2):
-             Relative weight of the generalized IoU loss in the object detection loss.
-         eos_coefficient (`float`, *optional*, defaults to 0.1):
-             Relative classification weight of the 'no-object' class in the object detection loss.
-         focal_alpha (`float`, *optional*, defaults to 0.25):
-             Alpha parameter in the focal loss.
-         auxiliary_loss (`bool`, *optional*, defaults to `True`):
-             Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
-
-     Examples:
-
-     ```python
-     >>> from transformers import LwDetrConfig, LwDetrModel
-
-     >>> # Initializing an LW-DETR AnnaZhang/lwdetr_small_60e_coco style configuration
-     >>> configuration = LwDetrConfig()
-
-     >>> # Initializing a model (with random weights) from the AnnaZhang/lwdetr_small_60e_coco style configuration
-     >>> model = LwDetrModel(configuration)
-
-     >>> # Accessing the model configuration
-     >>> configuration = model.config
-     ```"""
-
295
- model_type = "lw_detr"
296
- sub_configs = {"backbone_config": AutoConfig}
297
-
298
- def __init__(
299
- self,
300
- # backbone
301
- backbone_config=None,
302
- # projector
303
- projector_scale_factors: list[float] = [],
304
- hidden_expansion=0.5,
305
- c2f_num_blocks=3,
306
- activation_function="silu",
307
- batch_norm_eps=1e-5,
308
- # decoder
309
- d_model=256,
310
- dropout=0.1,
311
- decoder_ffn_dim=2048,
312
- decoder_n_points=4,
313
- decoder_layers: int = 3,
314
- decoder_self_attention_heads: int = 8,
315
- decoder_cross_attention_heads: int = 16,
316
- decoder_activation_function="relu",
317
- # model
318
- num_queries=300,
319
- attention_bias=True,
320
- attention_dropout=0.0,
321
- activation_dropout=0.0,
322
- group_detr: int = 13,
323
- init_std=0.02,
324
- disable_custom_kernels=True,
325
- # loss
326
- class_cost=2,
327
- bbox_cost=5,
328
- giou_cost=2,
329
- mask_loss_coefficient=1,
330
- dice_loss_coefficient=1,
331
- bbox_loss_coefficient=5,
332
- giou_loss_coefficient=2,
333
- eos_coefficient=0.1,
334
- focal_alpha=0.25,
335
- auxiliary_loss=True,
336
- **kwargs,
337
- ):
338
- self.batch_norm_eps = batch_norm_eps
339
-
340
- # backbone
341
- if backbone_config is None:
342
- logger.info(
343
- "`backbone_config` and `backbone` are `None`. Initializing the config with the default `LwDetrViT` backbone."
344
- )
345
- backbone_config = LwDetrViTConfig(
346
- image_size=1024,
347
- hidden_size=192,
348
- num_hidden_layers=10,
349
- num_attention_heads=12,
350
- window_block_indices=[0, 1, 3, 6, 7, 9],
351
- out_indices=[2, 4, 5, 9],
352
- **kwargs,
353
- )
354
- elif isinstance(backbone_config, dict):
355
- backbone_model_type = backbone_config.pop("model_type")
356
- config_class = CONFIG_MAPPING[backbone_model_type]
357
- backbone_config = config_class.from_dict(backbone_config)
358
-
359
- self.backbone_config = backbone_config
360
- # projector
361
- self.projector_scale_factors = projector_scale_factors
362
- for scale in projector_scale_factors:
363
- if scale not in [0.5, 1.0, 2.0]:
364
- raise ValueError(f"Unsupported scale factor: {scale}")
365
- self.projector_in_channels = [d_model] * len(projector_scale_factors)
366
- self.projector_out_channels = d_model
367
- self.activation_function = activation_function
368
- self.hidden_expansion = hidden_expansion
369
- self.c2f_num_blocks = c2f_num_blocks
370
- # decoder
371
- self.d_model = d_model
372
- self.dropout = dropout
373
- self.num_queries = num_queries
374
- self.decoder_ffn_dim = decoder_ffn_dim
375
- self.num_feature_levels = len(self.projector_scale_factors)
376
- self.decoder_n_points = decoder_n_points
377
- self.decoder_layers = decoder_layers
378
- self.decoder_activation_function = decoder_activation_function
379
- self.decoder_self_attention_heads = decoder_self_attention_heads
380
- self.decoder_cross_attention_heads = decoder_cross_attention_heads
381
- self.attention_bias = attention_bias
382
- self.attention_dropout = attention_dropout
383
- self.activation_dropout = activation_dropout
384
- # model
385
- self.init_std = init_std
386
- self.group_detr = group_detr
387
- # Loss
388
- self.auxiliary_loss = auxiliary_loss
389
- # Hungarian matcher
390
- self.class_cost = class_cost
391
- self.bbox_cost = bbox_cost
392
- self.giou_cost = giou_cost
393
- # Loss coefficients
394
- self.dice_loss_coefficient = dice_loss_coefficient
395
- self.bbox_loss_coefficient = bbox_loss_coefficient
396
- self.giou_loss_coefficient = giou_loss_coefficient
397
- self.eos_coefficient = eos_coefficient
398
- self.focal_alpha = focal_alpha
399
- self.disable_custom_kernels = disable_custom_kernels
400
- super().__init__(**kwargs)
401
-
402
-
403
- class LwDetrViTSelfAttention(ViTSelfAttention):
-     def __init__(self, config: LwDetrViTConfig):
-         super().__init__(config)
-         del self.key
-         self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
-         self.num_key_value_groups = 1
-         self.dropout_prob = config.dropout_prob
-
-     def forward(
-         self,
-         hidden_states: torch.Tensor,
-         **kwargs: Unpack[TransformersKwargs],
-     ) -> tuple[torch.Tensor, torch.Tensor]:
-         batch_size = hidden_states.shape[0]
-         new_shape = batch_size, -1, self.num_attention_heads, self.attention_head_size
-
-         key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2)
-         value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
-         query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)
-
-         attention_interface: Callable = eager_attention_forward
-         if self.config._attn_implementation != "eager":
-             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
-
-         context_layer, attention_probs = attention_interface(
-             self,
-             query_layer,
-             key_layer,
-             value_layer,
-             None,
-             is_causal=self.is_causal,
-             scaling=self.scaling,
-             dropout=0.0 if not self.training else self.dropout_prob,
-             **kwargs,
-         )
-
-         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
-         context_layer = context_layer.reshape(new_context_layer_shape)
-
-         return context_layer, attention_probs
-
-
- class LwDetrViTAttention(ViTAttention):
-     def __init__(self, config: LwDetrViTConfig):
-         """
-         Args:
-             config (`LwDetrViTConfig`):
-                 Model configuration.
-         """
-         super().__init__(config)
-         self.attention = LwDetrViTSelfAttention(config)
-         self.output = nn.Linear(config.hidden_size, config.hidden_size)
-
-     def forward(
-         self,
-         hidden_states: torch.Tensor,
-         **kwargs: Unpack[TransformersKwargs],
-     ) -> torch.Tensor:
-         self_attn_output, _ = self.attention(hidden_states, **kwargs)
-         output = self.output(self_attn_output)
-         return output
-
-
- class LwDetrViTMlp(VitDetMlp):
-     pass
-
-
- class LwDetrViTLayer(GradientCheckpointingLayer):
-     def __init__(
-         self,
-         config: LwDetrViTConfig,
-         layer_idx,
-     ) -> None:
-         super().__init__()
-
-         dim = config.hidden_size
-         self.attention = LwDetrViTAttention(config)
-         self.intermediate = LwDetrViTMlp(config=config, in_features=dim, hidden_features=int(dim * config.mlp_ratio))
-         self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-         self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-
-         self.gamma_1 = nn.Parameter(torch.empty(dim), requires_grad=True)
-         self.gamma_2 = nn.Parameter(torch.empty(dim), requires_grad=True)
-
-         self.window = layer_idx in config.window_block_indices
-         self.num_windows = config.num_windows
-
-     def forward(
-         self,
-         hidden_states: torch.Tensor,
-         **kwargs: Unpack[TransformersKwargs],
-     ) -> torch.Tensor:
-         batch_size, seq_len, channels = hidden_states.shape
-         hidden_states_norm = self.layernorm_before(hidden_states)
-
-         if not self.window:
-             hidden_states_norm = hidden_states_norm.reshape(
-                 batch_size // self.num_windows, self.num_windows * seq_len, channels
-             )
-
-         attention_output = self.attention(hidden_states_norm, **kwargs)
-         attention_output = attention_output * self.gamma_1
-
-         if not self.window:
-             attention_output = attention_output.reshape(batch_size, seq_len, channels)
-
-         hidden_states = hidden_states + attention_output
-
-         layer_output = self.layernorm_after(hidden_states)
-         layer_output = self.intermediate(layer_output)
-         layer_output = layer_output * self.gamma_2
-
-         hidden_states = hidden_states + layer_output
-
-         return hidden_states
-
-
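The global-attention branch above is pure reshaping: folding the window axis back into the sequence axis is exact and invertible. A minimal sketch with assumed shapes:

```python
import torch

# With the window-major layout, the batch axis carries batch * num_windows
# window sequences; a global layer views them as one long sequence per image,
# and the inverse reshape restores the windowed layout bit-for-bit.
batch, num_windows, window_len, channels = 2, 16, 64, 192
windowed = torch.randn(batch * num_windows, window_len, channels)
global_seq = windowed.reshape(batch, num_windows * window_len, channels)
assert global_seq.reshape(batch * num_windows, window_len, channels).equal(windowed)
```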
- class LwDetrViTEncoder(ViTEncoder):
-     def __init__(self, config: LwDetrViTConfig) -> None:
-         super().__init__(config)
-         self.layer = nn.ModuleList([LwDetrViTLayer(config, i) for i in range(config.num_hidden_layers)])
-
-     def forward(
-         self,
-         hidden_states: torch.Tensor,
-         **kwargs: Unpack[TransformersKwargs],
-     ) -> list[torch.Tensor]:
-         list_hidden_states = [hidden_states]
-         for layer_module in self.layer:
-             hidden_states = layer_module(hidden_states, **kwargs)
-             list_hidden_states.append(hidden_states)
-         return list_hidden_states
-
-
- class LwDetrViTEmbeddings(VitDetEmbeddings):
-     pass
-
-
- class LwDetrViTPreTrainedModel(VitDetPreTrainedModel):
-     config: LwDetrViTConfig
-     base_model_prefix = "lw_detr_vit"
-     main_input_name = "pixel_values"
-     supports_gradient_checkpointing = True
-     _no_split_modules = ["LwDetrViTEmbeddings", "LwDetrViTLayer"]
-     _supports_sdpa = True
-     _supports_flash_attn = True
-     _supports_flex_attn = True
-     _supports_attention_backend = True
-     _can_record_outputs = {
-         "hidden_states": LwDetrViTLayer,
-         "attentions": LwDetrViTSelfAttention,
-     }
-
-     def _init_weights(self, module) -> None:
-         """Initialize the weights"""
-         if isinstance(module, (nn.Linear, nn.Conv2d)):
-             init.trunc_normal_(module.weight, mean=0.0, std=self.config.initializer_range)
-             if module.bias is not None:
-                 init.zeros_(module.bias)
-         elif isinstance(module, nn.LayerNorm):
-             init.zeros_(module.bias)
-             init.ones_(module.weight)
-         elif isinstance(module, LwDetrViTEmbeddings):
-             init.trunc_normal_(module.position_embeddings, mean=0.0, std=self.config.initializer_range)
-         if isinstance(module, LwDetrViTLayer):
-             nn.init.constant_(module.gamma_1, self.config.cae_init_values)
-             nn.init.constant_(module.gamma_2, self.config.cae_init_values)
-
-
- @auto_docstring()
- class LwDetrViTBackbone(VitDetBackbone):
-     @check_model_inputs
-     @auto_docstring
-     def forward(self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BackboneOutput:
-         r"""
-         Examples:
-
-         ```python
-         >>> from transformers import LwDetrViTConfig, LwDetrViTBackbone
-         >>> import torch
-
-         >>> config = LwDetrViTConfig()
-         >>> model = LwDetrViTBackbone(config)
-
-         >>> pixel_values = torch.randn(1, 3, 256, 256)
-
-         >>> with torch.no_grad():
-         ...     outputs = model(pixel_values)
-
-         >>> feature_maps = outputs.feature_maps
-         >>> list(feature_maps[-1].shape)
-         [1, 768, 16, 16]
-         ```"""
-         embedding_output = self.embeddings(pixel_values)
-
-         batch_size, channels, height, width = embedding_output.shape
-         # (batch_size, channels, height, width) -> (batch_size, height, width, channels)
-         hidden_states = embedding_output.permute(0, 2, 3, 1)
-
-         window_height = height // self.config.num_windows_side
-         window_width = width // self.config.num_windows_side
-         # (batch_size, height, width, channels) -> (batch_size*num_windows_side**2, window_height*window_width, channels)
-         hidden_states = (
-             hidden_states.reshape(
-                 batch_size,
-                 self.config.num_windows_side,
-                 window_height,
-                 self.config.num_windows_side,
-                 window_width,
-                 channels,
-             )
-             .permute(0, 1, 3, 2, 4, 5)
-             .reshape(batch_size * self.config.num_windows_side**2, window_height * window_width, channels)
-         )
-
-         hidden_states = self.encoder(hidden_states, **kwargs)
-
-         feature_maps = ()
-         for stage, hidden_state in zip(self.stage_names, hidden_states):
-             if stage in self.out_features:
-                 hidden_state = (
-                     hidden_state.reshape(
-                         batch_size,
-                         self.config.num_windows_side,
-                         self.config.num_windows_side,
-                         window_height,
-                         window_width,
-                         channels,
-                     )
-                     .permute(0, 5, 1, 3, 2, 4)
-                     .reshape(batch_size, channels, height, width)
-                 )
-                 feature_maps += (hidden_state,)
-
-         return BackboneOutput(feature_maps=feature_maps)
-
-
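The partition in the backbone forward above is the standard window-major reshape. A self-contained sketch with assumed shapes (not taken from a checkpoint):

```python
import torch

# (B, H, W, C) -> (B * n_side**2, win_h * win_w, C), windows ordered row-major,
# mirroring the reshape/permute/reshape chain in the forward pass above.
B, H, W, C, n_side = 1, 16, 16, 192, 4
win_h, win_w = H // n_side, W // n_side
feature_map = torch.randn(B, H, W, C)
windows = (
    feature_map.reshape(B, n_side, win_h, n_side, win_w, C)
    .permute(0, 1, 3, 2, 4, 5)
    .reshape(B * n_side**2, win_h * win_w, C)
)
print(windows.shape)  # torch.Size([16, 16, 192])
```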
- class LwDetrConvNormLayer(RTDetrConvNormLayer):
-     def __init__(
-         self,
-         config: LwDetrConfig,
-         in_channels: int,
-         out_channels: int,
-         kernel_size: int,
-         stride: int,
-         activation: str | None = None,
-     ):
-         super().__init__(config, in_channels, out_channels, kernel_size, stride, activation)
-         self.conv = nn.Conv2d(
-             in_channels,
-             out_channels,
-             kernel_size,
-             stride,
-             padding=kernel_size // 2,
-             bias=False,
-         )
-
-
- class LwDetrRepVggBlock(nn.Module):
-     def __init__(self, config: LwDetrConfig):
-         super().__init__()
-         hidden_channels = int(config.d_model * config.hidden_expansion)
-         self.conv1 = LwDetrConvNormLayer(
-             config, hidden_channels, hidden_channels, 3, 1, activation=config.activation_function
-         )
-         self.conv2 = LwDetrConvNormLayer(
-             config, hidden_channels, hidden_channels, 3, 1, activation=config.activation_function
-         )
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         y = self.conv1(x)
-         y = self.conv2(y)
-         return y
-
-
- class LwDetrC2FLayer(nn.Module):
-     # Inspired by RTDetrCSPRepLayer
-     def __init__(self, config: LwDetrConfig, in_channels: int):
-         super().__init__()
-         num_blocks = config.c2f_num_blocks
-         activation = config.activation_function
-         out_channels = config.d_model
-
-         self.hidden_channels = int(out_channels * config.hidden_expansion)
-
-         conv1_out_channels = 2 * self.hidden_channels
-         self.conv1 = LwDetrConvNormLayer(config, in_channels, conv1_out_channels, 1, 1, activation=activation)
-
-         conv2_in_channels = (2 + num_blocks) * self.hidden_channels
-         self.conv2 = LwDetrConvNormLayer(config, conv2_in_channels, out_channels, 1, 1, activation=activation)
-
-         self.bottlenecks = nn.ModuleList(LwDetrRepVggBlock(config) for _ in range(num_blocks))
-
-     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-         hidden_states = self.conv1(hidden_states)
-         all_hidden_states = list(hidden_states.split(self.hidden_channels, 1))
-         hidden_states = all_hidden_states[-1]
-
-         for bottleneck in self.bottlenecks:
-             hidden_states = bottleneck(hidden_states)
-             all_hidden_states.append(hidden_states)
-
-         hidden_states = torch.cat(all_hidden_states, 1)
-         hidden_states = self.conv2(hidden_states)
-         return hidden_states
-
-
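The channel bookkeeping in the C2F forward above is worth spelling out; a sketch with assumed defaults (d_model=256, hidden_expansion=0.5, 3 bottlenecks), using random tensors as stand-ins for the RepVgg blocks:

```python
import torch

hidden_channels, num_blocks = 128, 3
conv1_out = torch.randn(1, 2 * hidden_channels, 8, 8)     # conv1 output
branches = list(conv1_out.split(hidden_channels, 1))      # two 128-channel maps
for _ in range(num_blocks):
    branches.append(torch.randn_like(branches[-1]))       # stand-in for a bottleneck
fused = torch.cat(branches, 1)
print(fused.shape[1])  # (2 + 3) * 128 = 640, consumed by the 1x1 conv2
```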
- class LwDetrLayerNorm(ConvNextLayerNorm):
-     pass
-
-
- class LwDetrSamplingLayer(nn.Module):
-     def __init__(self, config: LwDetrConfig, channel_size: int, scale: float):
-         super().__init__()
-
-         self.scale = scale
-         self.channel_size = channel_size
-
-         layers = []
-         if scale == 2.0:
-             if channel_size > 512:
-                 layers.append(LwDetrConvNormLayer(config, channel_size, channel_size // 2, 1, 1, activation="relu"))
-                 layers.append(nn.ConvTranspose2d(channel_size // 2, channel_size // 4, kernel_size=2, stride=2))
-             else:
-                 layers.append(nn.ConvTranspose2d(channel_size, channel_size // 2, 2, 2))
-         elif scale == 0.5:
-             layers.append(LwDetrConvNormLayer(config, channel_size, channel_size, 3, 2, activation="relu"))
-         self.layers = nn.ModuleList(layers)
-
-     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-         for layer in self.layers:
-             hidden_states = layer(hidden_states)
-         return hidden_states
-
-
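A quick shape check of the sampling rules above, using plain torch layers as stand-ins for `LwDetrConvNormLayer` (channel counts assumed): scale 2.0 doubles the resolution and halves the channels in the <=512 branch, scale 0.5 halves the resolution with a stride-2, padding-1 convolution:

```python
import torch
from torch import nn

x = torch.randn(1, 192, 16, 16)
up = nn.ConvTranspose2d(192, 96, kernel_size=2, stride=2)(x)
down = nn.Conv2d(192, 192, kernel_size=3, stride=2, padding=1)(x)
print(tuple(up.shape), tuple(down.shape))  # (1, 96, 32, 32) (1, 192, 8, 8)
```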
- class LwDetrScaleProjector(nn.Module):
-     def __init__(self, config: LwDetrConfig, scale: float):
-         super().__init__()
-
-         intermediate_dims = [config.backbone_config.hidden_size] * len(config.backbone_config.out_indices)
-         sampling_layers = []
-         for channel_size in intermediate_dims:
-             sampling_layers.append(LwDetrSamplingLayer(config, channel_size, scale))
-         self.sampling_layers = nn.ModuleList(sampling_layers)
-
-         intermediate_dim = intermediate_dims[-1]
-         if scale == 2.0:
-             if intermediate_dim > 512:
-                 intermediate_dim = intermediate_dim // 4
-             else:
-                 intermediate_dim = intermediate_dim // 2
-         projector_input_dim = intermediate_dim * len(intermediate_dims)
-
-         self.projector_layer = LwDetrC2FLayer(config, projector_input_dim)
-         self.layer_norm = LwDetrLayerNorm(config.d_model, data_format="channels_first")
-
-     def forward(self, hidden_states_tuple: tuple[torch.Tensor]) -> torch.Tensor:
-         sampled_hidden_states = []
-         for sampling_layer, hidden_states in zip(self.sampling_layers, hidden_states_tuple):
-             hidden_states = sampling_layer(hidden_states)
-             sampled_hidden_states.append(hidden_states)
-         hidden_states = torch.cat(sampled_hidden_states, dim=1)
-         hidden_states = self.projector_layer(hidden_states)
-         hidden_states = self.layer_norm(hidden_states)
-         return hidden_states
-
-
- class LwDetrMultiScaleProjector(nn.Module):
-     def __init__(self, config: LwDetrConfig):
-         super().__init__()
-
-         self.config = config
-         scale_factors = config.projector_scale_factors
-
-         self.scale_layers = nn.ModuleList([LwDetrScaleProjector(config, scale) for scale in scale_factors])
-
-     def forward(self, hidden_states: tuple[torch.Tensor]) -> list[torch.Tensor]:
-         output_hidden_states = []
-         for scale_layer in self.scale_layers:
-             output_hidden_states.append(scale_layer(hidden_states))
-         return output_hidden_states
-
-
- class LwDetrConvEncoder(nn.Module):
-     def __init__(self, config: LwDetrConfig):
-         super().__init__()
-         self.backbone = LwDetrViTBackbone(config.backbone_config)
-         self.projector = LwDetrMultiScaleProjector(config)
-
-     def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
-         # send pixel_values through the model to get list of feature maps
-         features = self.backbone(pixel_values).feature_maps
-         features = self.projector(features)
-         out = []
-         for feature_map in features:
-             # downsample pixel_mask to match shape of corresponding feature_map
-             mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
-             out.append((feature_map, mask))
-         return out
-
-
- class LwDetrAttention(nn.Module):
-     def __init__(self, config: LwDetrConfig, layer_idx: int):
-         super().__init__()
-         self.config = config
-         self.layer_idx = layer_idx
-         self.head_dim = getattr(config, "head_dim", config.d_model // config.decoder_self_attention_heads)
-         self.scaling = self.head_dim**-0.5
-         self.attention_dropout = config.attention_dropout
-         self.is_causal = False
-         self.num_key_value_groups = 1
-
-         self.q_proj = nn.Linear(
-             config.d_model, config.decoder_self_attention_heads * self.head_dim, bias=config.attention_bias
-         )
-         self.k_proj = nn.Linear(
-             config.d_model, config.decoder_self_attention_heads * self.head_dim, bias=config.attention_bias
-         )
-         self.v_proj = nn.Linear(
-             config.d_model, config.decoder_self_attention_heads * self.head_dim, bias=config.attention_bias
-         )
-         self.o_proj = nn.Linear(
-             config.decoder_self_attention_heads * self.head_dim, config.d_model, bias=config.attention_bias
-         )
-
-     def forward(
-         self,
-         hidden_states: torch.Tensor,
-         position_embeddings: torch.Tensor | None = None,
-         **kwargs: Unpack[TransformersKwargs],
-     ) -> tuple[torch.Tensor, torch.Tensor]:
-         batch_size, seq_len, _ = hidden_states.shape
-
-         hidden_states_original = hidden_states
-         if position_embeddings is not None:
-             hidden_states = hidden_states + position_embeddings
-
-         if self.training:
-             # at training time, the group detr technique adds supervision through multiple weight-sharing
-             # decoder groups; each group is moved to the batch axis so it only self-attends within itself
-             # at inference, only one decoder group is used
-             hidden_states_original = torch.cat(
-                 hidden_states_original.split(seq_len // self.config.group_detr, dim=1), dim=0
-             )
-             hidden_states = torch.cat(hidden_states.split(seq_len // self.config.group_detr, dim=1), dim=0)
-
-         input_shape = hidden_states.shape[:-1]
-         hidden_shape = (*input_shape, -1, self.head_dim)
-
-         query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-         key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-         value_states = self.v_proj(hidden_states_original).view(hidden_shape).transpose(1, 2)
-
-         attention_interface: Callable = eager_attention_forward
-         if self.config._attn_implementation != "eager":
-             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
-
-         attn_output, attn_weights = attention_interface(
-             self,
-             query_states,
-             key_states,
-             value_states,
-             attention_mask=None,
-             dropout=0.0 if not self.training else self.attention_dropout,
-             scaling=self.scaling,
-             **kwargs,
-         )
-         attn_output = attn_output.reshape(*input_shape, -1).contiguous()
-         attn_output = self.o_proj(attn_output)
-
-         if self.training:
-             attn_output = torch.cat(torch.split(attn_output, batch_size, dim=0), dim=1)
-
-         return attn_output, attn_weights
-
-
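The split/cat bookkeeping above is easy to get wrong, so here is a self-contained sketch of the round trip (all shapes assumed, not taken from a checkpoint):

```python
import torch

# Group DETR: queries are split into `group_detr` groups along the sequence
# axis and stacked along the batch axis, so self-attention stays within a
# group; the inverse split/cat restores the original layout exactly.
batch, num_queries, group_detr, dim = 2, 300, 13, 256
queries = torch.randn(batch, num_queries * group_detr, dim)
grouped = torch.cat(queries.split(num_queries, dim=1), dim=0)   # (26, 300, 256)
restored = torch.cat(torch.split(grouped, batch, dim=0), dim=1)
assert restored.equal(queries)
```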
- class LwDetrMultiscaleDeformableAttention(DeformableDetrMultiscaleDeformableAttention):
-     def forward(
-         self,
-         hidden_states: torch.Tensor,
-         attention_mask: torch.Tensor | None = None,
-         encoder_hidden_states=None,
-         encoder_attention_mask=None,
-         position_embeddings: torch.Tensor | None = None,
-         reference_points=None,
-         spatial_shapes=None,
-         spatial_shapes_list=None,
-         level_start_index=None,
-         **kwargs: Unpack[TransformersKwargs],
-     ):
-         return super().forward(
-             hidden_states=hidden_states,
-             attention_mask=attention_mask,
-             encoder_hidden_states=encoder_hidden_states,
-             encoder_attention_mask=encoder_attention_mask,
-             position_embeddings=position_embeddings,
-             reference_points=reference_points,
-             spatial_shapes=spatial_shapes,
-             spatial_shapes_list=spatial_shapes_list,
-             level_start_index=level_start_index,
-             **kwargs,
-         )
-
-
- class LwDetrMLP(nn.Module):
-     def __init__(self, config: LwDetrConfig):
-         super().__init__()
-         self.dropout = config.dropout
-         self.activation_fn = ACT2FN[config.decoder_activation_function]
-         self.fc1 = nn.Linear(config.d_model, config.decoder_ffn_dim)
-         self.fc2 = nn.Linear(config.decoder_ffn_dim, config.d_model)
-
-     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-         residual = hidden_states
-         hidden_states = self.fc1(hidden_states)
-         hidden_states = self.activation_fn(hidden_states)
-         hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-         hidden_states = self.fc2(hidden_states)
-         hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-         hidden_states = residual + hidden_states
-         return hidden_states
-
-
- class LwDetrDecoderLayer(GradientCheckpointingLayer):
-     def __init__(self, config: LwDetrConfig, layer_idx: int):
-         nn.Module.__init__(self)
-
-         # self-attention
-         self.self_attn = LwDetrAttention(config, layer_idx=layer_idx)
-         self.dropout = config.dropout
-         self.activation_fn = ACT2FN[config.decoder_activation_function]
-         self.activation_dropout = config.activation_dropout
-         self.self_attn_layer_norm = nn.LayerNorm(config.d_model)
-
-         # cross-attention
-         self.cross_attn = LwDetrMultiscaleDeformableAttention(
-             config,
-             num_heads=config.decoder_cross_attention_heads,
-             n_points=config.decoder_n_points,
-         )
-         self.cross_attn_layer_norm = nn.LayerNorm(config.d_model)
-
-         # mlp
-         self.mlp = LwDetrMLP(config)
-         self.layer_norm = nn.LayerNorm(config.d_model)
-
-     def forward(
-         self,
-         hidden_states: torch.Tensor,
-         position_embeddings: torch.Tensor | None = None,
-         reference_points=None,
-         spatial_shapes=None,
-         spatial_shapes_list=None,
-         level_start_index=None,
-         encoder_hidden_states: torch.Tensor | None = None,
-         encoder_attention_mask: torch.Tensor | None = None,
-         **kwargs: Unpack[TransformersKwargs],
-     ):
-         self_attention_output, self_attn_weights = self.self_attn(
-             hidden_states, position_embeddings=position_embeddings, **kwargs
-         )
-
-         self_attention_output = nn.functional.dropout(self_attention_output, p=self.dropout, training=self.training)
-         hidden_states = hidden_states + self_attention_output
-         hidden_states = self.self_attn_layer_norm(hidden_states)
-
-         cross_attention_output, cross_attn_weights = self.cross_attn(
-             hidden_states=hidden_states,
-             attention_mask=encoder_attention_mask,
-             encoder_hidden_states=encoder_hidden_states,
-             encoder_attention_mask=encoder_attention_mask,
-             position_embeddings=position_embeddings,
-             reference_points=reference_points,
-             spatial_shapes=spatial_shapes,
-             spatial_shapes_list=spatial_shapes_list,
-             level_start_index=level_start_index,
-             **kwargs,
-         )
-         cross_attention_output = nn.functional.dropout(cross_attention_output, p=self.dropout, training=self.training)
-         hidden_states = hidden_states + cross_attention_output
-         hidden_states = self.cross_attn_layer_norm(hidden_states)
-
-         hidden_states = self.mlp(hidden_states)
-         hidden_states = self.layer_norm(hidden_states)
-
-         return hidden_states
-
-
- @auto_docstring
- class LwDetrPreTrainedModel(PreTrainedModel):
-     config: LwDetrConfig
-     base_model_prefix = "model"
-     main_input_name = "pixel_values"
-     _no_split_modules = [
-         r"LwDetrConvEncoder",
-         r"LwDetrDecoderLayer",
-     ]
-     _supports_sdpa = True
-     _supports_flash_attn = True
-     _supports_flex_attn = True
-     _supports_attention_backend = True
-     _can_record_outputs = {
-         "attentions": [LwDetrAttention, LwDetrMultiscaleDeformableAttention],
-         "hidden_states": [LwDetrDecoderLayer],
-     }
-
-     @torch.no_grad()
-     def _init_weights(self, module):
-         super()._init_weights(module)
-
-         if isinstance(module, LwDetrMultiscaleDeformableAttention):
-             init.constant_(module.sampling_offsets.weight, 0.0)
-             thetas = torch.arange(module.n_heads, dtype=torch.int64).float() * (2.0 * math.pi / module.n_heads)
-             grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
-             grid_init = (
-                 (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
-                 .view(module.n_heads, 1, 1, 2)
-                 .repeat(1, module.n_levels, module.n_points, 1)
-             )
-             for i in range(module.n_points):
-                 grid_init[:, :, i, :] *= i + 1
-
-             init.copy_(module.sampling_offsets.bias, grid_init.view(-1))
-             init.constant_(module.attention_weights.weight, 0.0)
-             init.constant_(module.attention_weights.bias, 0.0)
-             init.xavier_uniform_(module.value_proj.weight)
-             init.constant_(module.value_proj.bias, 0.0)
-             init.xavier_uniform_(module.output_proj.weight)
-             init.constant_(module.output_proj.bias, 0.0)
-         if hasattr(module, "level_embed"):
-             init.normal_(module.level_embed)
-         if hasattr(module, "refpoint_embed") and module.refpoint_embed is not None:
-             init.constant_(module.refpoint_embed.weight, 0)
-         if hasattr(module, "class_embed") and module.class_embed is not None:
-             prior_prob = 0.01
-             bias_value = -math.log((1 - prior_prob) / prior_prob)
-             init.constant_(module.class_embed.bias, bias_value)
-         if hasattr(module, "bbox_embed") and module.bbox_embed is not None:
-             init.constant_(module.bbox_embed.layers[-1].weight, 0)
-             init.constant_(module.bbox_embed.layers[-1].bias, 0)
-
-
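The classification-head bias above uses the familiar focal-loss prior trick: choosing `bias = -log((1 - p) / p)` makes the head's initial sigmoid output equal `p`, so training starts with mostly background predictions. A one-liner check:

```python
import math

prior_prob = 0.01
bias_value = -math.log((1 - prior_prob) / prior_prob)
print(round(1 / (1 + math.exp(-bias_value)), 4))  # 0.01
```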
- def refine_bboxes(reference_points, deltas):
-     reference_points = reference_points.to(deltas.device)
-     new_reference_points_cxcy = deltas[..., :2] * reference_points[..., 2:] + reference_points[..., :2]
-     new_reference_points_wh = deltas[..., 2:].exp() * reference_points[..., 2:]
-     new_reference_points = torch.cat((new_reference_points_cxcy, new_reference_points_wh), -1)
-     return new_reference_points
-
-
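A numeric check of the delta parameterization used by `refine_bboxes` above: zero deltas are the identity, a delta of 1 on cx shifts the center by one box width, and widths/heights are rescaled multiplicatively via `exp()` (the helper below just inlines the same math):

```python
import torch

def refine(reference, deltas):
    cxcy = deltas[..., :2] * reference[..., 2:] + reference[..., :2]
    wh = deltas[..., 2:].exp() * reference[..., 2:]
    return torch.cat((cxcy, wh), -1)

box = torch.tensor([[0.5, 0.5, 0.2, 0.4]])  # (cx, cy, w, h)
print(refine(box, torch.zeros(1, 4)))                     # unchanged
print(refine(box, torch.tensor([[1.0, 0.0, 0.0, 0.0]])))  # cx: 0.5 -> 0.7
```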
- @dataclass
- @auto_docstring(
-     custom_intro="""
-     Base class for outputs of the LwDetrDecoder. This class adds two attributes to
-     BaseModelOutputWithCrossAttentions, namely:
-     - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
-     - a stacked tensor of intermediate reference points.
-     """
- )
- class LwDetrDecoderOutput(DeformableDetrDecoderOutput):
-     pass
-
-
- class LwDetrDecoder(LwDetrPreTrainedModel):
-     """
-     Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`LwDetrDecoderLayer`].
-
-     The decoder updates the query embeddings through multiple self-attention and deformable cross-attention layers.
-
-     Some tweaks for LwDetr:
-
-     - it uses the group detr technique at training time for faster convergence.
-
-     Args:
-         config: LwDetrConfig
-     """
-
-     def __init__(self, config: LwDetrConfig):
-         super().__init__(config)
-         self.dropout = config.dropout
-         self.layers = nn.ModuleList([LwDetrDecoderLayer(config, i) for i in range(config.decoder_layers)])
-         self.layernorm = nn.LayerNorm(config.d_model)
-
-         self.gradient_checkpointing = False
-
-         self.ref_point_head = LwDetrMLPPredictionHead(2 * config.d_model, config.d_model, config.d_model, num_layers=2)
-
-         self.post_init()
-
-     def get_reference(self, reference_points, valid_ratios):
-         # batch_size, num_queries, 4
-         obj_center = reference_points[..., :4]
-
-         # batch_size, num_queries, num_levels, 4
-         reference_points_inputs = obj_center[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
-
-         # batch_size, num_queries, d_model * 2
-         query_sine_embed = gen_sine_position_embeddings(reference_points_inputs[:, :, 0, :], self.config.d_model)
-
-         # batch_size, num_queries, d_model
-         query_pos = self.ref_point_head(query_sine_embed)
-         return reference_points_inputs, query_pos
-
-     def forward(
-         self,
-         inputs_embeds: torch.Tensor | None = None,
-         reference_points: torch.Tensor | None = None,
-         spatial_shapes: torch.Tensor | None = None,
-         spatial_shapes_list: torch.Tensor | None = None,
-         level_start_index: torch.Tensor | None = None,
-         valid_ratios: torch.Tensor | None = None,
-         encoder_hidden_states: torch.Tensor | None = None,
-         encoder_attention_mask: torch.Tensor | None = None,
-         **kwargs: Unpack[TransformersKwargs],
-     ):
-         intermediate = ()
-         intermediate_reference_points = (reference_points,)
-
-         if inputs_embeds is not None:
-             hidden_states = inputs_embeds
-
-         reference_points_inputs, query_pos = self.get_reference(reference_points, valid_ratios)
-
-         for decoder_layer in self.layers:
-             hidden_states = decoder_layer(
-                 hidden_states,
-                 encoder_hidden_states=encoder_hidden_states,
-                 encoder_attention_mask=encoder_attention_mask,
-                 position_embeddings=query_pos,
-                 reference_points=reference_points_inputs,
-                 spatial_shapes=spatial_shapes,
-                 spatial_shapes_list=spatial_shapes_list,
-                 level_start_index=level_start_index,
-                 **kwargs,
-             )
-             intermediate_hidden_states = self.layernorm(hidden_states)
-             intermediate += (intermediate_hidden_states,)
-
-         intermediate = torch.stack(intermediate)
-         last_hidden_state = intermediate[-1]
-         intermediate_reference_points = torch.stack(intermediate_reference_points)
-
-         return LwDetrDecoderOutput(
-             last_hidden_state=last_hidden_state,
-             intermediate_hidden_states=intermediate,
-             intermediate_reference_points=intermediate_reference_points,
-         )
-
-
- @dataclass
- @auto_docstring(
-     custom_intro="""
-     Base class for outputs of the LwDetr backbone-decoder model.
-     """
- )
- class LwDetrModelOutput(ModelOutput):
-     r"""
-     init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
-         Initial reference points sent through the Transformer decoder.
-     intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
-         Stacked intermediate hidden states (output of each layer of the decoder).
-     intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
-         Stacked intermediate reference points (reference points of each layer of the decoder).
-     enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
-         Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
-         picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
-         foreground and background).
-     enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
-         Logits of predicted bounding boxes coordinates in the first stage.
-     """
-
-     init_reference_points: torch.FloatTensor | None = None
-     last_hidden_state: torch.FloatTensor | None = None
-     intermediate_hidden_states: torch.FloatTensor | None = None
-     intermediate_reference_points: torch.FloatTensor | None = None
-     enc_outputs_class: torch.FloatTensor | None = None
-     enc_outputs_coord_logits: torch.FloatTensor | None = None
-
-
- @auto_docstring(
-     custom_intro="""
-     The bare LW Detr Model (consisting of a backbone and decoder Transformer) outputting raw
-     hidden-states without any specific head on top.
-     """
- )
- class LwDetrModel(DeformableDetrModel):
-     def __init__(self, config: LwDetrConfig):
-         LwDetrPreTrainedModel.__init__(self, config)
-
-         # Create backbone + positional encoding
-         self.backbone = LwDetrConvEncoder(config)
-
-         self.group_detr = config.group_detr
-         self.num_queries = config.num_queries
-         hidden_dim = config.d_model
-         self.reference_point_embed = nn.Embedding(self.num_queries * self.group_detr, 4)
-         self.query_feat = nn.Embedding(self.num_queries * self.group_detr, hidden_dim)
-
-         self.decoder = LwDetrDecoder(config)
-
-         self.enc_output = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(self.group_detr)])
-         self.enc_output_norm = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(self.group_detr)])
-         # Should normally be None and then instantiated in the ForObjectDetection class
-         self.enc_out_bbox_embed = nn.ModuleList(
-             [LwDetrMLPPredictionHead(config.d_model, config.d_model, 4, num_layers=3) for _ in range(self.group_detr)]
-         )
-         self.enc_out_class_embed = nn.ModuleList(
-             [nn.Linear(config.d_model, config.num_labels) for _ in range(self.group_detr)]
-         )
-
-         self.post_init()
-
-     def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
-         """Generate the encoder output proposals from encoded enc_output.
-
-         Args:
-             enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder.
-             padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`.
-             spatial_shapes (list[tuple[int, int]]): Spatial shapes of the feature maps.
-
-         Returns:
-             `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
-             - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to
-               directly predict a bounding box (without the need of a decoder).
-             - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse
-               sigmoid.
-         """
-         batch_size = enc_output.shape[0]
-         proposals = []
-         _cur = 0
-         for level, (height, width) in enumerate(spatial_shapes):
-             mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1)
-             valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
-             valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
-
-             grid_y, grid_x = meshgrid(
-                 torch.linspace(
-                     0,
-                     height - 1,
-                     height,
-                     dtype=enc_output.dtype,
-                     device=enc_output.device,
-                 ),
-                 torch.linspace(
-                     0,
-                     width - 1,
-                     width,
-                     dtype=enc_output.dtype,
-                     device=enc_output.device,
-                 ),
-                 indexing="ij",
-             )
-             grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
-
-             scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2)
-             grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale
-             width_height = torch.ones_like(grid) * 0.05 * (2.0**level)
-             proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4)
-             proposals.append(proposal)
-             _cur += height * width
-         output_proposals = torch.cat(proposals, 1)
-         output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
-         output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf"))
-         output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf"))
-
-         # assign each pixel as an object query
-         object_query = enc_output
-         object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0))
-         object_query = object_query.masked_fill(~output_proposals_valid, float(0))
-         return object_query, output_proposals
-
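A self-contained sketch of the per-level proposal grid built in `gen_encoder_output_proposals` above, leaving out padding for brevity: cell centers are normalized to (0, 1) and proposal sizes follow `0.05 * 2**level`:

```python
import torch

height, width, level = 4, 4, 0
grid_y, grid_x = torch.meshgrid(
    torch.linspace(0, height - 1, height),
    torch.linspace(0, width - 1, width),
    indexing="ij",
)
centers = (torch.stack((grid_x, grid_y), -1) + 0.5) / torch.tensor([width, height])
sizes = torch.full_like(centers, 0.05 * 2.0**level)
proposals = torch.cat((centers, sizes), -1).view(-1, 4)
print(proposals.shape, proposals[0])  # torch.Size([16, 4]) tensor([0.1250, 0.1250, 0.0500, 0.0500])
```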
-     @check_model_inputs
-     @auto_docstring
-     def forward(
-         self,
-         pixel_values: torch.FloatTensor = None,
-         pixel_mask: torch.LongTensor | None = None,
-         **kwargs: Unpack[TransformersKwargs],
-     ) -> LwDetrModelOutput:
-         r"""
-         Examples:
-
-         ```python
-         >>> from transformers import AutoImageProcessor, LwDetrModel
-         >>> from PIL import Image
-         >>> import httpx
-         >>> from io import BytesIO
-
-         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-         >>> with httpx.stream("GET", url) as response:
-         ...     image = Image.open(BytesIO(response.read()))
-
-         >>> image_processor = AutoImageProcessor.from_pretrained("AnnaZhang/lwdetr_small_60e_coco")
-         >>> model = LwDetrModel.from_pretrained("AnnaZhang/lwdetr_small_60e_coco")
-
-         >>> inputs = image_processor(images=image, return_tensors="pt")
-
-         >>> outputs = model(**inputs)
-
-         >>> last_hidden_states = outputs.last_hidden_state
-         >>> list(last_hidden_states.shape)
-         [1, 300, 256]
-         ```"""
-         batch_size, num_channels, height, width = pixel_values.shape
-         device = pixel_values.device
-
-         if pixel_mask is None:
-             pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device)
-
-         # Extract multi-scale feature maps, all projected to `config.d_model` channels (cf Figure 4 in the paper)
-         # First, send pixel_values + pixel_mask through the backbone to obtain the features,
-         # which is a list of (feature_map, mask) tuples
-         features = self.backbone(pixel_values, pixel_mask)
-
-         sources = []
-         masks = []
-         for source, mask in features:
-             sources.append(source)
-             masks.append(mask)
-             if mask is None:
-                 raise ValueError("No attention mask was provided")
-
-         if self.training:
-             reference_points = self.reference_point_embed.weight
-             query_feat = self.query_feat.weight
-         else:
-             # only use one group at inference
-             reference_points = self.reference_point_embed.weight[: self.num_queries]
-             query_feat = self.query_feat.weight[: self.num_queries]
-
-         # Prepare encoder inputs (by flattening)
-         source_flatten = []
-         mask_flatten = []
-         spatial_shapes_list = []
-         for source, mask in zip(sources, masks):
-             batch_size, num_channels, height, width = source.shape
-             spatial_shape = (height, width)
-             spatial_shapes_list.append(spatial_shape)
-             source = source.flatten(2).transpose(1, 2)
-             mask = mask.flatten(1)
-             source_flatten.append(source)
-             mask_flatten.append(mask)
-         source_flatten = torch.cat(source_flatten, 1)
-         mask_flatten = torch.cat(mask_flatten, 1)
-         spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device)
-         level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
-         valid_ratios = torch.stack([self.get_valid_ratio(m, dtype=source_flatten.dtype) for m in masks], 1)
-
-         target = query_feat.unsqueeze(0).expand(batch_size, -1, -1)
-         reference_points = reference_points.unsqueeze(0).expand(batch_size, -1, -1)
-
-         object_query_embedding, output_proposals = self.gen_encoder_output_proposals(
-             source_flatten, ~mask_flatten, spatial_shapes_list
-         )
-
-         group_detr = self.group_detr if self.training else 1
-         topk = self.num_queries
-         topk_coords_logits = []
-         topk_coords_logits_undetach = []
-         object_query_undetach = []
-
-         for group_id in range(group_detr):
-             group_object_query = self.enc_output[group_id](object_query_embedding)
-             group_object_query = self.enc_output_norm[group_id](group_object_query)
-
-             group_enc_outputs_class = self.enc_out_class_embed[group_id](group_object_query)
-             group_delta_bbox = self.enc_out_bbox_embed[group_id](group_object_query)
-             group_enc_outputs_coord = refine_bboxes(output_proposals, group_delta_bbox)
-
-             group_topk_proposals = torch.topk(group_enc_outputs_class.max(-1)[0], topk, dim=1)[1]
-             group_topk_coords_logits_undetach = torch.gather(
-                 group_enc_outputs_coord,
-                 1,
-                 group_topk_proposals.unsqueeze(-1).repeat(1, 1, 4),
-             )
-             group_topk_coords_logits = group_topk_coords_logits_undetach.detach()
-             group_object_query_undetach = torch.gather(
-                 group_object_query, 1, group_topk_proposals.unsqueeze(-1).repeat(1, 1, self.config.d_model)
-             )
-
-             topk_coords_logits.append(group_topk_coords_logits)
-             topk_coords_logits_undetach.append(group_topk_coords_logits_undetach)
-             object_query_undetach.append(group_object_query_undetach)
-
-         topk_coords_logits = torch.cat(topk_coords_logits, 1)
-         topk_coords_logits_undetach = torch.cat(topk_coords_logits_undetach, 1)
-         object_query_undetach = torch.cat(object_query_undetach, 1)
-
-         enc_outputs_class = object_query_undetach
-         enc_outputs_coord_logits = topk_coords_logits
-
-         reference_points = refine_bboxes(topk_coords_logits_undetach, reference_points)
-
-         init_reference_points = reference_points
-         decoder_outputs = self.decoder(
-             inputs_embeds=target,
-             reference_points=reference_points,
-             spatial_shapes=spatial_shapes,
-             spatial_shapes_list=spatial_shapes_list,
-             level_start_index=level_start_index,
-             valid_ratios=valid_ratios,
-             encoder_hidden_states=source_flatten,
-             encoder_attention_mask=mask_flatten,
-             **kwargs,
-         )
-
-         return LwDetrModelOutput(
-             init_reference_points=init_reference_points,
-             last_hidden_state=decoder_outputs.last_hidden_state,
-             intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
-             intermediate_reference_points=decoder_outputs.intermediate_reference_points,
-             enc_outputs_class=enc_outputs_class,
-             enc_outputs_coord_logits=enc_outputs_coord_logits,
-         )
-
-
-
1418
- class LwDetrMLPPredictionHead(DeformableDetrMLPPredictionHead):
1419
- pass
1420
-
1421
-
1422
- @dataclass
1423
- @auto_docstring(
1424
- custom_intro="""
1425
- Output type of [`LwDetrForObjectDetection`].
1426
- """
1427
- )
1428
- class LwDetrObjectDetectionOutput(ModelOutput):
1429
- r"""
1430
- loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
1431
- Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
1432
- bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
1433
- scale-invariant IoU loss.
1434
- loss_dict (`Dict`, *optional*):
1435
- A dictionary containing the individual losses. Useful for logging.
1436
- logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
1437
- Classification logits (including no-object) for all queries.
1438
- pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
1439
- Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
1440
- values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
1441
- possible padding). You can use [`~DeformableDetrProcessor.post_process_object_detection`] to retrieve the
1442
- unnormalized bounding boxes.
1443
-     auxiliary_outputs (`list[Dict]`, *optional*):
-         Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
-         and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
-         `pred_boxes`) for each decoder layer.
-     init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
-         Initial reference points sent through the Transformer decoder.
-     intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
-         Stacked intermediate hidden states (output of each layer of the decoder).
-     intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
-         Stacked intermediate reference points (reference points of each layer of the decoder).
-     enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
-         Predicted bounding box scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
-         picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
-         foreground and background).
-     enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
-         Logits of predicted bounding box coordinates in the first stage.
-     """
- 
-     loss: torch.FloatTensor | None = None
-     loss_dict: dict | None = None
-     logits: torch.FloatTensor | None = None
-     pred_boxes: torch.FloatTensor | None = None
-     auxiliary_outputs: list[dict] | None = None
-     init_reference_points: torch.FloatTensor | None = None
-     last_hidden_state: torch.FloatTensor | None = None
-     intermediate_hidden_states: torch.FloatTensor | None = None
-     intermediate_reference_points: torch.FloatTensor | None = None
-     enc_outputs_class: Any = None
-     enc_outputs_coord_logits: torch.FloatTensor | None = None
- 
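As a concrete illustration of the `pred_boxes` format documented above: normalized (center_x, center_y, width, height) boxes convert to absolute (xmin, ymin, xmax, ymax) corners as sketched below, which is essentially the rescaling that `post_process_object_detection` performs; the helper name is made up for this example.

```python
import torch

def cxcywh_to_xyxy_abs(boxes: torch.Tensor, height: int, width: int) -> torch.Tensor:
    # boxes: (..., 4) holding normalized (cx, cy, w, h) in [0, 1].
    cx, cy, w, h = boxes.unbind(-1)
    corners = torch.stack([cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2], dim=-1)
    # Scale to pixel coordinates of the original image.
    return corners * torch.tensor([width, height, width, height], dtype=boxes.dtype)

boxes = torch.tensor([[0.5, 0.5, 0.2, 0.4]])  # one box centered in the image
print(cxcywh_to_xyxy_abs(boxes, height=480, width=640))  # tensor([[256., 144., 384., 336.]])
```
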
1473
-
1474
- @auto_docstring(
1475
- custom_intro="""
1476
-     LW-DETR Model (consisting of a backbone and decoder Transformer) with object detection heads on
-     top, for tasks such as COCO detection.
- """
1479
- )
1480
- class LwDetrForObjectDetection(DeformableDetrForObjectDetection):
1481
- _tied_weights_keys = None
1482
-
1483
- def __init__(self, config: LwDetrConfig):
1484
- PreTrainedModel.__init__(self, config)
1485
- self.model = LwDetrModel(config)
1486
- self.class_embed = nn.Linear(config.d_model, config.num_labels)
1487
- self.bbox_embed = LwDetrMLPPredictionHead(config.d_model, config.d_model, 4, num_layers=3)
1488
-
1489
- self.post_init()
1490
-
1491
- @check_model_inputs
1492
- @auto_docstring
1493
- def forward(
1494
- self,
1495
- pixel_values: torch.FloatTensor = None,
1496
- pixel_mask: torch.LongTensor | None = None,
1497
- labels: list[dict] | None = None,
1498
- **kwargs: Unpack[TransformersKwargs],
1499
- ) -> LwDetrObjectDetectionOutput:
1500
- r"""
1501
- decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
1502
- Not used by default. Can be used to mask object queries.
1503
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1504
- Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
1505
- can choose to directly pass a flattened representation of an image.
1506
- decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
1507
- Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
1508
- embedded representation.
1509
- labels (`list[Dict]` of len `(batch_size,)`, *optional*):
1510
- Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
1511
- following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
1512
- respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
1513
- in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
1514
-
1515
-         Examples:
- 
-         ```python
-         >>> import torch
-         >>> from transformers import AutoImageProcessor, LwDetrForObjectDetection
-         >>> from PIL import Image
-         >>> import httpx
-         >>> from io import BytesIO
- 
-         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-         >>> with httpx.stream("GET", url) as response:
-         ...     image = Image.open(BytesIO(response.read()))
- 
-         >>> image_processor = AutoImageProcessor.from_pretrained("AnnaZhang/lwdetr_small_60e_coco")
-         >>> model = LwDetrForObjectDetection.from_pretrained("AnnaZhang/lwdetr_small_60e_coco")
- 
-         >>> inputs = image_processor(images=image, return_tensors="pt")
-         >>> outputs = model(**inputs)
- 
-         >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
-         >>> target_sizes = torch.tensor([image.size[::-1]])
-         >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0]
-         >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
-         ...     box = [round(i, 2) for i in box.tolist()]
-         ...     print(
-         ...         f"Detected {model.config.id2label[label.item()]} with confidence "
-         ...         f"{round(score.item(), 3)} at location {box}"
-         ...     )
-         Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78]
-         Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25]
-         Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25]
-         ```"""
-         outputs = self.model(
-             pixel_values,
-             pixel_mask=pixel_mask,
-             **kwargs,
-         )
- 
-         last_hidden_states = outputs.last_hidden_state
-         intermediate_reference_points = outputs.intermediate_reference_points
-         enc_outputs_class_logits = outputs.enc_outputs_class
-         enc_outputs_boxes_logits = outputs.enc_outputs_coord_logits
- 
-         logits = self.class_embed(last_hidden_states)
-         pred_boxes_delta = self.bbox_embed(last_hidden_states)
-         pred_boxes = refine_bboxes(intermediate_reference_points[-1], pred_boxes_delta)
- 
-         enc_outputs_class_logits_list = enc_outputs_class_logits.split(self.config.num_queries, dim=1)
-         pred_class = []
-         group_detr = self.config.group_detr if self.training else 1
-         for group_index in range(group_detr):
-             group_pred_class = self.model.enc_out_class_embed[group_index](enc_outputs_class_logits_list[group_index])
-             pred_class.append(group_pred_class)
-         enc_outputs_class_logits = torch.cat(pred_class, dim=1)
- 
-         loss, loss_dict, auxiliary_outputs = None, None, None
-         if labels is not None:
-             outputs_class, outputs_coord = None, None
-             if self.config.auxiliary_loss:
-                 intermediate_hidden_states = outputs.intermediate_hidden_states
-                 outputs_coord_delta = self.bbox_embed(intermediate_hidden_states)
-                 outputs_coord = refine_bboxes(intermediate_reference_points, outputs_coord_delta)
-                 outputs_class = self.class_embed(intermediate_hidden_states)
- 
-             loss, loss_dict, auxiliary_outputs = self.loss_function(
-                 logits,
-                 labels,
-                 self.device,
-                 pred_boxes,
-                 self.config,
-                 outputs_class,
-                 outputs_coord,
-                 enc_outputs_class_logits,
-                 enc_outputs_boxes_logits,
-             )
- 
-         return LwDetrObjectDetectionOutput(
-             loss=loss,
-             loss_dict=loss_dict,
-             logits=logits,
-             pred_boxes=pred_boxes,
-             auxiliary_outputs=auxiliary_outputs,
-             last_hidden_state=outputs.last_hidden_state,
-             intermediate_hidden_states=outputs.intermediate_hidden_states,
-             intermediate_reference_points=outputs.intermediate_reference_points,
-             init_reference_points=outputs.init_reference_points,
-             enc_outputs_class=enc_outputs_class_logits,
-             enc_outputs_coord_logits=enc_outputs_boxes_logits,
-         )
-
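A note on the per-group loop in `forward` above: LW-DETR trains with Group-DETR-style query groups, carrying `config.group_detr` independent sets of `config.num_queries` queries concatenated along the query dimension, each scored by its own encoder classification head, and keeping a single group at inference (`group_detr = ... if self.training else 1`). A minimal, self-contained sketch of the split/score/concat step; all tensor sizes are illustrative:

```python
import torch
from torch import nn

batch, num_queries, d_model, num_labels, group_detr = 2, 300, 256, 91, 4
# Encoder query features for all groups, concatenated along dim=1.
features = torch.randn(batch, group_detr * num_queries, d_model)
# One classification head per query group (mirrors `enc_out_class_embed`).
heads = nn.ModuleList(nn.Linear(d_model, num_labels) for _ in range(group_detr))

groups = features.split(num_queries, dim=1)  # group_detr chunks of (batch, 300, 256)
logits = torch.cat([head(g) for head, g in zip(heads, groups)], dim=1)
print(logits.shape)  # torch.Size([2, 1200, 91])
```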
-
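And since the `labels` docstring above only describes the expected dict layout, here is a hedged sketch of a training call that exercises the loss path; the checkpoint name is taken from the usage example above, while the image tensor and annotations are made-up data:

```python
import torch
from transformers import LwDetrForObjectDetection

model = LwDetrForObjectDetection.from_pretrained("AnnaZhang/lwdetr_small_60e_coco")
pixel_values = torch.randn(1, 3, 640, 640)  # dummy batch of one image

# One dict per image: `class_labels` as a LongTensor and `boxes` as a
# FloatTensor of normalized (center_x, center_y, width, height) in [0, 1].
labels = [
    {
        "class_labels": torch.tensor([17, 17]),  # e.g. two cats (COCO id 17)
        "boxes": torch.tensor([[0.25, 0.5, 0.4, 0.8], [0.75, 0.4, 0.4, 0.7]]),
    }
]

outputs = model(pixel_values=pixel_values, labels=labels)
print(outputs.loss)  # scalar combining classification, L1 and GIoU box losses
```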
- __all__ = [
-     "LwDetrConfig",
-     "LwDetrPreTrainedModel",
-     "LwDetrModel",
-     "LwDetrForObjectDetection",
-     "LwDetrViTConfig",
-     "LwDetrViTPreTrainedModel",
-     "LwDetrViTBackbone",
- ]
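
The `__all__` list above defines the module's public surface; in a release that still ships this module, these names are importable from the top-level `transformers` namespace, e.g.:

```python
# Quick check of the public exports named above (default config, random weights).
from transformers import LwDetrConfig, LwDetrForObjectDetection

config = LwDetrConfig()                   # default LW-DETR configuration
model = LwDetrForObjectDetection(config)  # randomly initialized model
print(sum(p.numel() for p in model.parameters()))  # parameter count
```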