transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (671)
  1. transformers/__init__.py +20 -1
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/configuration_utils.py +114 -70
  6. transformers/conversion_mapping.py +68 -5
  7. transformers/core_model_loading.py +201 -35
  8. transformers/dependency_versions_table.py +1 -1
  9. transformers/feature_extraction_utils.py +54 -22
  10. transformers/generation/candidate_generator.py +79 -31
  11. transformers/generation/configuration_utils.py +162 -122
  12. transformers/generation/continuous_batching/cache.py +47 -18
  13. transformers/generation/continuous_batching/cache_manager.py +131 -34
  14. transformers/generation/continuous_batching/continuous_api.py +101 -64
  15. transformers/generation/continuous_batching/requests.py +28 -1
  16. transformers/generation/continuous_batching/scheduler.py +11 -4
  17. transformers/generation/stopping_criteria.py +1 -1
  18. transformers/generation/utils.py +108 -110
  19. transformers/generation/watermarking.py +8 -5
  20. transformers/image_processing_base.py +2 -12
  21. transformers/image_processing_utils_fast.py +15 -4
  22. transformers/initialization.py +37 -0
  23. transformers/integrations/__init__.py +12 -0
  24. transformers/integrations/accelerate.py +44 -111
  25. transformers/integrations/aqlm.py +3 -5
  26. transformers/integrations/awq.py +2 -5
  27. transformers/integrations/bitnet.py +5 -8
  28. transformers/integrations/bitsandbytes.py +16 -15
  29. transformers/integrations/deepspeed.py +18 -3
  30. transformers/integrations/eetq.py +3 -5
  31. transformers/integrations/fbgemm_fp8.py +1 -1
  32. transformers/integrations/finegrained_fp8.py +6 -16
  33. transformers/integrations/flash_attention.py +2 -2
  34. transformers/integrations/higgs.py +2 -5
  35. transformers/integrations/hub_kernels.py +23 -5
  36. transformers/integrations/integration_utils.py +35 -0
  37. transformers/integrations/mistral.py +12 -0
  38. transformers/integrations/moe.py +240 -0
  39. transformers/integrations/mxfp4.py +4 -10
  40. transformers/integrations/peft.py +5 -0
  41. transformers/integrations/quanto.py +5 -2
  42. transformers/integrations/spqr.py +3 -5
  43. transformers/integrations/tensor_parallel.py +167 -221
  44. transformers/integrations/vptq.py +3 -5
  45. transformers/modeling_gguf_pytorch_utils.py +66 -19
  46. transformers/modeling_rope_utils.py +78 -81
  47. transformers/modeling_utils.py +583 -503
  48. transformers/models/__init__.py +19 -0
  49. transformers/models/afmoe/modeling_afmoe.py +7 -16
  50. transformers/models/afmoe/modular_afmoe.py +5 -13
  51. transformers/models/aimv2/modeling_aimv2.py +4 -0
  52. transformers/models/aimv2/modular_aimv2.py +4 -0
  53. transformers/models/albert/modeling_albert.py +3 -0
  54. transformers/models/align/modeling_align.py +12 -6
  55. transformers/models/altclip/modeling_altclip.py +7 -3
  56. transformers/models/apertus/modeling_apertus.py +4 -2
  57. transformers/models/apertus/modular_apertus.py +4 -1
  58. transformers/models/arcee/modeling_arcee.py +1 -1
  59. transformers/models/aria/modeling_aria.py +8 -4
  60. transformers/models/aria/modular_aria.py +7 -3
  61. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  62. transformers/models/auto/auto_factory.py +1 -1
  63. transformers/models/auto/configuration_auto.py +27 -0
  64. transformers/models/auto/feature_extraction_auto.py +7 -3
  65. transformers/models/auto/image_processing_auto.py +4 -2
  66. transformers/models/auto/modeling_auto.py +31 -0
  67. transformers/models/auto/processing_auto.py +4 -0
  68. transformers/models/auto/tokenization_auto.py +132 -153
  69. transformers/models/auto/video_processing_auto.py +5 -2
  70. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  71. transformers/models/bamba/modeling_bamba.py +18 -19
  72. transformers/models/bamba/modular_bamba.py +17 -16
  73. transformers/models/bark/modeling_bark.py +9 -0
  74. transformers/models/bart/configuration_bart.py +0 -1
  75. transformers/models/bart/modeling_bart.py +7 -0
  76. transformers/models/beit/image_processing_beit_fast.py +0 -1
  77. transformers/models/bert/modeling_bert.py +3 -0
  78. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  79. transformers/models/big_bird/modeling_big_bird.py +3 -0
  80. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
  81. transformers/models/bit/modeling_bit.py +5 -1
  82. transformers/models/bitnet/modeling_bitnet.py +1 -1
  83. transformers/models/blenderbot/modeling_blenderbot.py +7 -0
  84. transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
  85. transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
  86. transformers/models/blip/modeling_blip.py +2 -0
  87. transformers/models/blip/modeling_blip_text.py +8 -0
  88. transformers/models/blip_2/modeling_blip_2.py +2 -0
  89. transformers/models/bloom/modeling_bloom.py +13 -44
  90. transformers/models/blt/modeling_blt.py +162 -2
  91. transformers/models/blt/modular_blt.py +168 -3
  92. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  93. transformers/models/bridgetower/modeling_bridgetower.py +6 -0
  94. transformers/models/bros/modeling_bros.py +8 -0
  95. transformers/models/camembert/modeling_camembert.py +109 -106
  96. transformers/models/canine/modeling_canine.py +6 -0
  97. transformers/models/canine/tokenization_canine.py +2 -0
  98. transformers/models/chameleon/modeling_chameleon.py +9 -4
  99. transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
  100. transformers/models/clap/feature_extraction_clap.py +2 -2
  101. transformers/models/clap/modeling_clap.py +25 -15
  102. transformers/models/clip/modeling_clip.py +2 -0
  103. transformers/models/clipseg/modeling_clipseg.py +4 -0
  104. transformers/models/clvp/modeling_clvp.py +14 -3
  105. transformers/models/code_llama/tokenization_code_llama.py +1 -1
  106. transformers/models/codegen/modeling_codegen.py +13 -4
  107. transformers/models/cohere/modeling_cohere.py +1 -1
  108. transformers/models/cohere2/modeling_cohere2.py +1 -1
  109. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
  110. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  111. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  112. transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
  113. transformers/models/convbert/modeling_convbert.py +3 -0
  114. transformers/models/convnext/image_processing_convnext.py +2 -2
  115. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  116. transformers/models/csm/generation_csm.py +19 -22
  117. transformers/models/csm/modeling_csm.py +3 -1
  118. transformers/models/csm/modular_csm.py +2 -0
  119. transformers/models/ctrl/modeling_ctrl.py +14 -2
  120. transformers/models/cvt/modeling_cvt.py +5 -1
  121. transformers/models/cwm/modeling_cwm.py +1 -1
  122. transformers/models/d_fine/configuration_d_fine.py +3 -4
  123. transformers/models/d_fine/modeling_d_fine.py +46 -39
  124. transformers/models/d_fine/modular_d_fine.py +15 -4
  125. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  126. transformers/models/dab_detr/modeling_dab_detr.py +1 -1
  127. transformers/models/dac/modeling_dac.py +4 -4
  128. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  129. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  130. transformers/models/dbrx/configuration_dbrx.py +9 -1
  131. transformers/models/dbrx/modeling_dbrx.py +1 -1
  132. transformers/models/deberta/modeling_deberta.py +2 -0
  133. transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
  134. transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
  135. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
  136. transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
  137. transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
  138. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
  139. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  140. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  141. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  142. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  143. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  144. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  145. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  146. transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
  147. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  148. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  149. transformers/models/detr/configuration_detr.py +1 -1
  150. transformers/models/detr/modeling_detr.py +8 -1
  151. transformers/models/dia/generation_dia.py +3 -10
  152. transformers/models/dia/modeling_dia.py +12 -1
  153. transformers/models/dia/modular_dia.py +11 -0
  154. transformers/models/dia/processing_dia.py +1 -1
  155. transformers/models/diffllama/modeling_diffllama.py +3 -3
  156. transformers/models/diffllama/modular_diffllama.py +2 -2
  157. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  158. transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
  159. transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
  160. transformers/models/distilbert/modeling_distilbert.py +11 -9
  161. transformers/models/doge/modeling_doge.py +1 -1
  162. transformers/models/donut/image_processing_donut_fast.py +0 -1
  163. transformers/models/donut/modeling_donut_swin.py +16 -12
  164. transformers/models/dots1/modeling_dots1.py +14 -5
  165. transformers/models/dpt/configuration_dpt.py +1 -1
  166. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  167. transformers/models/dpt/modular_dpt.py +1 -2
  168. transformers/models/edgetam/configuration_edgetam.py +1 -1
  169. transformers/models/edgetam/modeling_edgetam.py +5 -2
  170. transformers/models/edgetam/modular_edgetam.py +15 -14
  171. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
  172. transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
  173. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  174. transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
  175. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  176. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  177. transformers/models/efficientnet/modeling_efficientnet.py +5 -1
  178. transformers/models/electra/modeling_electra.py +7 -0
  179. transformers/models/emu3/modeling_emu3.py +8 -2
  180. transformers/models/emu3/modular_emu3.py +7 -1
  181. transformers/models/encodec/modeling_encodec.py +14 -0
  182. transformers/models/eomt/image_processing_eomt_fast.py +46 -14
  183. transformers/models/eomt/modeling_eomt.py +7 -0
  184. transformers/models/eomt/modular_eomt.py +7 -0
  185. transformers/models/ernie/modeling_ernie.py +6 -0
  186. transformers/models/ernie/modular_ernie.py +6 -0
  187. transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
  188. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
  189. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
  190. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  191. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  192. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  193. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  194. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  195. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  196. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  197. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  198. transformers/models/esm/modeling_esm.py +6 -0
  199. transformers/models/esm/modeling_esmfold.py +6 -1
  200. transformers/models/evolla/modeling_evolla.py +9 -1
  201. transformers/models/evolla/modular_evolla.py +8 -0
  202. transformers/models/exaone4/modeling_exaone4.py +1 -1
  203. transformers/models/falcon/modeling_falcon.py +3 -3
  204. transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
  205. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  206. transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
  207. transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
  208. transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
  209. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
  210. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  211. transformers/models/flaubert/modeling_flaubert.py +14 -15
  212. transformers/models/flava/image_processing_flava_fast.py +0 -2
  213. transformers/models/flava/modeling_flava.py +4 -1
  214. transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
  215. transformers/models/florence2/modeling_florence2.py +20 -3
  216. transformers/models/florence2/modular_florence2.py +13 -0
  217. transformers/models/fnet/modeling_fnet.py +7 -0
  218. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  219. transformers/models/fuyu/modeling_fuyu.py +3 -1
  220. transformers/models/fuyu/processing_fuyu.py +16 -0
  221. transformers/models/gemma/modeling_gemma.py +10 -12
  222. transformers/models/gemma/modular_gemma.py +9 -11
  223. transformers/models/gemma2/modeling_gemma2.py +1 -1
  224. transformers/models/gemma2/modular_gemma2.py +1 -1
  225. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  226. transformers/models/gemma3/modeling_gemma3.py +28 -7
  227. transformers/models/gemma3/modular_gemma3.py +26 -6
  228. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  229. transformers/models/gemma3n/modeling_gemma3n.py +47 -9
  230. transformers/models/gemma3n/modular_gemma3n.py +51 -9
  231. transformers/models/git/modeling_git.py +181 -126
  232. transformers/models/glm/modeling_glm.py +1 -1
  233. transformers/models/glm4/modeling_glm4.py +1 -1
  234. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  235. transformers/models/glm46v/modeling_glm46v.py +3 -1
  236. transformers/models/glm46v/modular_glm46v.py +3 -0
  237. transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
  238. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  239. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  240. transformers/models/glm4v/modeling_glm4v.py +15 -5
  241. transformers/models/glm4v/modular_glm4v.py +11 -3
  242. transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
  243. transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
  244. transformers/models/glmasr/__init__.py +30 -0
  245. transformers/models/glmasr/configuration_glmasr.py +197 -0
  246. transformers/models/glmasr/modeling_glmasr.py +512 -0
  247. transformers/models/glmasr/modular_glmasr.py +433 -0
  248. transformers/models/glmasr/processing_glmasr.py +332 -0
  249. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  250. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  251. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  252. transformers/models/gpt2/modeling_gpt2.py +8 -5
  253. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
  254. transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
  255. transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
  256. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
  257. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  258. transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
  259. transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
  260. transformers/models/gptj/modeling_gptj.py +15 -6
  261. transformers/models/granite/modeling_granite.py +1 -1
  262. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  263. transformers/models/granitemoe/modeling_granitemoe.py +2 -3
  264. transformers/models/granitemoe/modular_granitemoe.py +1 -2
  265. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  266. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
  267. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  268. transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
  269. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  270. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
  271. transformers/models/groupvit/modeling_groupvit.py +6 -1
  272. transformers/models/helium/modeling_helium.py +1 -1
  273. transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
  274. transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
  275. transformers/models/hubert/modeling_hubert.py +4 -0
  276. transformers/models/hubert/modular_hubert.py +4 -0
  277. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
  278. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  279. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  280. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
  281. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  282. transformers/models/ibert/modeling_ibert.py +16 -0
  283. transformers/models/idefics/modeling_idefics.py +10 -0
  284. transformers/models/idefics2/modeling_idefics2.py +7 -1
  285. transformers/models/idefics3/modeling_idefics3.py +5 -1
  286. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  287. transformers/models/imagegpt/modeling_imagegpt.py +9 -2
  288. transformers/models/instructblip/modeling_instructblip.py +2 -0
  289. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  290. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  291. transformers/models/internvl/modeling_internvl.py +11 -8
  292. transformers/models/internvl/modular_internvl.py +5 -9
  293. transformers/models/internvl/video_processing_internvl.py +0 -1
  294. transformers/models/jais2/__init__.py +27 -0
  295. transformers/models/jais2/configuration_jais2.py +152 -0
  296. transformers/models/jais2/modeling_jais2.py +486 -0
  297. transformers/models/jais2/modular_jais2.py +196 -0
  298. transformers/models/jamba/modeling_jamba.py +24 -19
  299. transformers/models/jamba/modular_jamba.py +17 -17
  300. transformers/models/janus/image_processing_janus_fast.py +0 -1
  301. transformers/models/janus/modeling_janus.py +15 -7
  302. transformers/models/janus/modular_janus.py +16 -7
  303. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  304. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  305. transformers/models/kosmos2/modeling_kosmos2.py +14 -2
  306. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  307. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  308. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
  309. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  310. transformers/models/lasr/configuration_lasr.py +4 -0
  311. transformers/models/lasr/modeling_lasr.py +3 -2
  312. transformers/models/lasr/modular_lasr.py +8 -1
  313. transformers/models/lasr/processing_lasr.py +0 -2
  314. transformers/models/layoutlm/modeling_layoutlm.py +5 -3
  315. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  316. transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
  317. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
  318. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  319. transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
  320. transformers/models/led/modeling_led.py +6 -0
  321. transformers/models/levit/modeling_levit.py +18 -0
  322. transformers/models/lfm2/modeling_lfm2.py +1 -1
  323. transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
  324. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  325. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  326. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  327. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  328. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  329. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  330. transformers/models/lilt/modeling_lilt.py +19 -15
  331. transformers/models/llama/modeling_llama.py +1 -1
  332. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  333. transformers/models/llama4/modeling_llama4.py +8 -4
  334. transformers/models/llava/image_processing_llava_fast.py +0 -1
  335. transformers/models/llava/modeling_llava.py +12 -7
  336. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  337. transformers/models/llava_next/modeling_llava_next.py +7 -3
  338. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  339. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  340. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  341. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  342. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  343. transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
  344. transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
  345. transformers/models/longt5/modeling_longt5.py +0 -4
  346. transformers/models/m2m_100/modeling_m2m_100.py +10 -0
  347. transformers/models/mamba/modeling_mamba.py +2 -1
  348. transformers/models/mamba2/modeling_mamba2.py +24 -23
  349. transformers/models/marian/configuration_marian.py +1 -1
  350. transformers/models/marian/modeling_marian.py +3 -0
  351. transformers/models/markuplm/modeling_markuplm.py +5 -8
  352. transformers/models/mask2former/configuration_mask2former.py +3 -3
  353. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  354. transformers/models/mask2former/modeling_mask2former.py +9 -0
  355. transformers/models/maskformer/configuration_maskformer.py +3 -3
  356. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  357. transformers/models/maskformer/modeling_maskformer.py +9 -1
  358. transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
  359. transformers/models/mbart/configuration_mbart.py +1 -0
  360. transformers/models/mbart/modeling_mbart.py +7 -0
  361. transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
  362. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  363. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  364. transformers/models/mimi/modeling_mimi.py +25 -4
  365. transformers/models/minimax/modeling_minimax.py +16 -3
  366. transformers/models/minimax/modular_minimax.py +12 -1
  367. transformers/models/ministral/modeling_ministral.py +1 -1
  368. transformers/models/ministral3/modeling_ministral3.py +1 -1
  369. transformers/models/mistral/modeling_mistral.py +1 -1
  370. transformers/models/mistral3/modeling_mistral3.py +10 -4
  371. transformers/models/mistral3/modular_mistral3.py +3 -1
  372. transformers/models/mixtral/modeling_mixtral.py +12 -4
  373. transformers/models/mixtral/modular_mixtral.py +6 -2
  374. transformers/models/mlcd/modeling_mlcd.py +6 -0
  375. transformers/models/mlcd/modular_mlcd.py +4 -0
  376. transformers/models/mllama/modeling_mllama.py +13 -2
  377. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  378. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
  379. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  380. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  381. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  382. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  383. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  384. transformers/models/mobilevit/modeling_mobilevit.py +4 -0
  385. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
  386. transformers/models/modernbert/modeling_modernbert.py +12 -1
  387. transformers/models/modernbert/modular_modernbert.py +12 -1
  388. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
  389. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
  390. transformers/models/moonshine/modeling_moonshine.py +1 -1
  391. transformers/models/moshi/modeling_moshi.py +21 -51
  392. transformers/models/mpnet/modeling_mpnet.py +2 -0
  393. transformers/models/mra/modeling_mra.py +4 -1
  394. transformers/models/mt5/configuration_mt5.py +2 -3
  395. transformers/models/mt5/modeling_mt5.py +0 -10
  396. transformers/models/musicgen/modeling_musicgen.py +5 -9
  397. transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
  398. transformers/models/mvp/modeling_mvp.py +7 -0
  399. transformers/models/nanochat/modeling_nanochat.py +1 -1
  400. transformers/models/nemotron/modeling_nemotron.py +3 -3
  401. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  402. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  403. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  404. transformers/models/nougat/tokenization_nougat.py +11 -16
  405. transformers/models/nystromformer/modeling_nystromformer.py +7 -0
  406. transformers/models/olmo/modeling_olmo.py +1 -1
  407. transformers/models/olmo2/modeling_olmo2.py +1 -1
  408. transformers/models/olmo3/modeling_olmo3.py +1 -1
  409. transformers/models/olmoe/modeling_olmoe.py +12 -4
  410. transformers/models/olmoe/modular_olmoe.py +4 -2
  411. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  412. transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
  413. transformers/models/oneformer/configuration_oneformer.py +3 -3
  414. transformers/models/oneformer/modeling_oneformer.py +7 -38
  415. transformers/models/openai/modeling_openai.py +12 -0
  416. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  417. transformers/models/ovis2/modeling_ovis2.py +15 -3
  418. transformers/models/ovis2/modular_ovis2.py +8 -0
  419. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  420. transformers/models/owlv2/modeling_owlv2.py +7 -3
  421. transformers/models/owlv2/modular_owlv2.py +0 -2
  422. transformers/models/owlvit/modeling_owlvit.py +7 -3
  423. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
  424. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
  425. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
  426. transformers/models/paligemma/modeling_paligemma.py +25 -17
  427. transformers/models/parakeet/modeling_parakeet.py +5 -0
  428. transformers/models/parakeet/modular_parakeet.py +5 -0
  429. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  430. transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
  431. transformers/models/patchtst/modeling_patchtst.py +5 -4
  432. transformers/models/pe_audio/__init__.py +30 -0
  433. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  434. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  435. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  436. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  437. transformers/models/pe_audio/processing_pe_audio.py +24 -0
  438. transformers/models/pe_audio_video/__init__.py +29 -0
  439. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  440. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  441. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  442. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  443. transformers/models/pe_video/__init__.py +30 -0
  444. transformers/models/pe_video/configuration_pe_video.py +211 -0
  445. transformers/models/pe_video/modeling_pe_video.py +636 -0
  446. transformers/models/pe_video/modular_pe_video.py +219 -0
  447. transformers/models/pe_video/processing_pe_video.py +10 -0
  448. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  449. transformers/models/pegasus/configuration_pegasus.py +1 -0
  450. transformers/models/pegasus/modeling_pegasus.py +3 -0
  451. transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
  452. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  453. transformers/models/perceiver/modeling_perceiver.py +5 -1
  454. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  455. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  456. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  457. transformers/models/persimmon/modeling_persimmon.py +1 -1
  458. transformers/models/phi/modeling_phi.py +1 -1
  459. transformers/models/phi3/modeling_phi3.py +1 -1
  460. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
  461. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
  462. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  463. transformers/models/phimoe/modeling_phimoe.py +12 -4
  464. transformers/models/phimoe/modular_phimoe.py +1 -1
  465. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  466. transformers/models/pixio/__init__.py +30 -0
  467. transformers/models/pixio/configuration_pixio.py +151 -0
  468. transformers/models/pixio/modeling_pixio.py +507 -0
  469. transformers/models/pixio/modular_pixio.py +404 -0
  470. transformers/models/pixtral/modeling_pixtral.py +1 -1
  471. transformers/models/pixtral/processing_pixtral.py +3 -1
  472. transformers/models/plbart/configuration_plbart.py +1 -0
  473. transformers/models/plbart/modeling_plbart.py +7 -0
  474. transformers/models/plbart/modular_plbart.py +6 -0
  475. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  476. transformers/models/poolformer/modeling_poolformer.py +11 -1
  477. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  478. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  479. transformers/models/prophetnet/modeling_prophetnet.py +2 -1
  480. transformers/models/qwen2/modeling_qwen2.py +1 -1
  481. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
  482. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
  483. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
  484. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
  485. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
  486. transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
  487. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  488. transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
  489. transformers/models/qwen3/modeling_qwen3.py +1 -1
  490. transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
  491. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
  492. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  493. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
  494. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
  495. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  496. transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
  497. transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
  498. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  499. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
  500. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
  501. transformers/models/rag/configuration_rag.py +0 -8
  502. transformers/models/rag/modeling_rag.py +7 -9
  503. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
  504. transformers/models/reformer/modeling_reformer.py +9 -1
  505. transformers/models/regnet/modeling_regnet.py +4 -0
  506. transformers/models/rembert/modeling_rembert.py +7 -1
  507. transformers/models/resnet/modeling_resnet.py +8 -3
  508. transformers/models/roberta/modeling_roberta.py +3 -0
  509. transformers/models/roberta/modular_roberta.py +3 -0
  510. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  511. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  512. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  513. transformers/models/rt_detr/modeling_rt_detr.py +4 -0
  514. transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
  515. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  516. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
  517. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  518. transformers/models/rwkv/modeling_rwkv.py +1 -1
  519. transformers/models/sam/configuration_sam.py +1 -0
  520. transformers/models/sam/image_processing_sam_fast.py +0 -1
  521. transformers/models/sam/modeling_sam.py +4 -1
  522. transformers/models/sam2/configuration_sam2.py +1 -1
  523. transformers/models/sam2/modeling_sam2.py +5 -1
  524. transformers/models/sam2/modular_sam2.py +5 -1
  525. transformers/models/sam2_video/modeling_sam2_video.py +51 -43
  526. transformers/models/sam2_video/modular_sam2_video.py +31 -18
  527. transformers/models/sam3/configuration_sam3.py +21 -1
  528. transformers/models/sam3/modeling_sam3.py +23 -0
  529. transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
  530. transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
  531. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  532. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
  533. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  534. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  535. transformers/models/sam3_video/modeling_sam3_video.py +3 -3
  536. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  537. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  538. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  539. transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
  540. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
  541. transformers/models/seed_oss/modeling_seed_oss.py +1 -1
  542. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  543. transformers/models/segformer/modeling_segformer.py +2 -2
  544. transformers/models/segformer/modular_segformer.py +0 -1
  545. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  546. transformers/models/siglip/modeling_siglip.py +24 -2
  547. transformers/models/siglip2/modeling_siglip2.py +63 -41
  548. transformers/models/smollm3/modeling_smollm3.py +1 -1
  549. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  550. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  551. transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
  552. transformers/models/speecht5/modeling_speecht5.py +28 -0
  553. transformers/models/splinter/modeling_splinter.py +9 -3
  554. transformers/models/squeezebert/modeling_squeezebert.py +2 -0
  555. transformers/models/stablelm/modeling_stablelm.py +1 -1
  556. transformers/models/starcoder2/modeling_starcoder2.py +1 -1
  557. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  558. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  559. transformers/models/swiftformer/modeling_swiftformer.py +4 -0
  560. transformers/models/swin/modeling_swin.py +16 -12
  561. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  562. transformers/models/swin2sr/modeling_swin2sr.py +49 -33
  563. transformers/models/swinv2/modeling_swinv2.py +41 -33
  564. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  565. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  566. transformers/models/t5/configuration_t5.py +7 -1
  567. transformers/models/t5/modeling_t5.py +1 -7
  568. transformers/models/t5gemma/modeling_t5gemma.py +1 -1
  569. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  570. transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
  571. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  572. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  573. transformers/models/table_transformer/modeling_table_transformer.py +1 -1
  574. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  575. transformers/models/timesfm/modeling_timesfm.py +12 -0
  576. transformers/models/timesfm/modular_timesfm.py +12 -0
  577. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  578. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  579. transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
  580. transformers/models/trocr/modeling_trocr.py +1 -2
  581. transformers/models/tvp/configuration_tvp.py +5 -1
  582. transformers/models/tvp/modeling_tvp.py +4 -4
  583. transformers/models/udop/configuration_udop.py +1 -0
  584. transformers/models/udop/modeling_udop.py +3 -7
  585. transformers/models/umt5/configuration_umt5.py +2 -2
  586. transformers/models/umt5/modeling_umt5.py +0 -6
  587. transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
  588. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  589. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  590. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  591. transformers/models/video_llava/modeling_video_llava.py +7 -3
  592. transformers/models/vilt/configuration_vilt.py +2 -2
  593. transformers/models/vilt/modeling_vilt.py +7 -0
  594. transformers/models/vipllava/modeling_vipllava.py +7 -3
  595. transformers/models/visual_bert/modeling_visual_bert.py +2 -0
  596. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  597. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  598. transformers/models/vitmatte/modeling_vitmatte.py +4 -0
  599. transformers/models/vitpose/configuration_vitpose.py +1 -1
  600. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  601. transformers/models/voxtral/modeling_voxtral.py +2 -2
  602. transformers/models/voxtral/modular_voxtral.py +2 -2
  603. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
  604. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
  605. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
  606. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  607. transformers/models/whisper/generation_whisper.py +1 -0
  608. transformers/models/whisper/modeling_whisper.py +5 -3
  609. transformers/models/x_clip/modeling_x_clip.py +2 -0
  610. transformers/models/xcodec/modeling_xcodec.py +5 -0
  611. transformers/models/xglm/modeling_xglm.py +10 -0
  612. transformers/models/xlm/modeling_xlm.py +13 -14
  613. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  614. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  615. transformers/models/xlnet/modeling_xlnet.py +3 -1
  616. transformers/models/xmod/modeling_xmod.py +3 -0
  617. transformers/models/yoso/modeling_yoso.py +4 -1
  618. transformers/models/zamba/modeling_zamba.py +2 -1
  619. transformers/models/zamba2/modeling_zamba2.py +3 -2
  620. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  621. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  622. transformers/models/zoedepth/modeling_zoedepth.py +7 -0
  623. transformers/pipelines/__init__.py +9 -6
  624. transformers/pipelines/automatic_speech_recognition.py +20 -12
  625. transformers/pipelines/base.py +1 -1
  626. transformers/pipelines/document_question_answering.py +1 -1
  627. transformers/pipelines/question_answering.py +1 -1
  628. transformers/pipelines/text_to_audio.py +2 -2
  629. transformers/processing_utils.py +127 -56
  630. transformers/quantizers/auto.py +2 -4
  631. transformers/quantizers/base.py +9 -64
  632. transformers/quantizers/quantizer_aqlm.py +1 -18
  633. transformers/quantizers/quantizer_auto_round.py +1 -10
  634. transformers/quantizers/quantizer_awq.py +3 -8
  635. transformers/quantizers/quantizer_bitnet.py +1 -6
  636. transformers/quantizers/quantizer_bnb_4bit.py +9 -49
  637. transformers/quantizers/quantizer_bnb_8bit.py +9 -19
  638. transformers/quantizers/quantizer_compressed_tensors.py +1 -4
  639. transformers/quantizers/quantizer_eetq.py +2 -12
  640. transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
  641. transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
  642. transformers/quantizers/quantizer_fp_quant.py +4 -4
  643. transformers/quantizers/quantizer_gptq.py +1 -4
  644. transformers/quantizers/quantizer_higgs.py +2 -6
  645. transformers/quantizers/quantizer_mxfp4.py +2 -28
  646. transformers/quantizers/quantizer_quanto.py +14 -14
  647. transformers/quantizers/quantizer_spqr.py +3 -8
  648. transformers/quantizers/quantizer_torchao.py +28 -124
  649. transformers/quantizers/quantizer_vptq.py +1 -10
  650. transformers/testing_utils.py +28 -12
  651. transformers/tokenization_mistral_common.py +3 -2
  652. transformers/tokenization_utils_base.py +3 -2
  653. transformers/tokenization_utils_tokenizers.py +25 -2
  654. transformers/trainer.py +24 -2
  655. transformers/trainer_callback.py +8 -0
  656. transformers/trainer_seq2seq.py +4 -0
  657. transformers/training_args.py +8 -10
  658. transformers/utils/__init__.py +4 -0
  659. transformers/utils/attention_visualizer.py +4 -4
  660. transformers/utils/auto_docstring.py +34 -25
  661. transformers/utils/generic.py +20 -0
  662. transformers/utils/import_utils.py +51 -9
  663. transformers/utils/kernel_config.py +71 -18
  664. transformers/utils/quantization_config.py +8 -8
  665. transformers/video_processing_utils.py +16 -12
  666. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
  667. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
  668. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
  669. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  670. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
  671. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/pixio/configuration_pixio.py
@@ -0,0 +1,151 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/pixio/modular_pixio.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_pixio.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Meta AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PreTrainedConfig
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+class PixioConfig(BackboneConfigMixin, PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PixioModel`]. It is used to instantiate a
+    Pixio model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the ViT
+    [facebook/pixio-huge](https://huggingface.co/facebook/pixio-huge) architecture.
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1280):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        mlp_ratio (`int`, *optional*, defaults to 4):
+            Ratio of the hidden size of the MLPs relative to the `hidden_size`.
+        n_cls_tokens (`int`, *optional*, defaults to 8):
+            Number of class tokens in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 256):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Stochastic depth rate per sample (when applied in the main path of residual layers).
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        apply_layernorm (`bool`, *optional*, defaults to `True`):
+            Whether to apply layer normalization to the feature maps in case the model is used as backbone.
+        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
+            Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
+            case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
+            seq_len, hidden_size)`.
+
+    Example:
+
+    ```python
+    >>> from transformers import PixioConfig, PixioModel
+
+    >>> # Initializing a Pixio pixio-huge style configuration
+    >>> configuration = PixioConfig()
+
+    >>> # Initializing a model (with random weights) from the pixio-huge style configuration
+    >>> model = PixioModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "pixio"
+
+    def __init__(
+        self,
+        hidden_size=1280,
+        num_hidden_layers=32,
+        num_attention_heads=16,
+        mlp_ratio=4,
+        n_cls_tokens=8,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-6,
+        image_size=256,
+        patch_size=16,
+        num_channels=3,
+        qkv_bias=True,
+        drop_path_rate=0.0,
+        out_features=None,
+        out_indices=None,
+        apply_layernorm=True,
+        reshape_hidden_states=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.mlp_ratio = mlp_ratio
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.qkv_bias = qkv_bias
+        self.drop_path_rate = drop_path_rate
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+        self.apply_layernorm = apply_layernorm
+        self.reshape_hidden_states = reshape_hidden_states
+
+        self.n_cls_tokens = n_cls_tokens
+
+
+__all__ = ["PixioConfig"]
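A note on the `out_features`/`out_indices` arguments documented in the hunk above: they are reconciled against `stage_names` (`"stem"` plus one `"stageN"` entry per hidden layer) by `get_aligned_output_features_output_indices`, so setting either one derives the other. A minimal sketch of that behavior, assuming the top-level `PixioConfig` export used in the docstring example; the printed values follow the default 32-layer configuration:

```python
from transformers import PixioConfig

# Select four intermediate stages by index; out_features is derived automatically.
config = PixioConfig(out_indices=[8, 16, 24, 32])
print(config.stage_names[:3])  # ['stem', 'stage1', 'stage2']
print(config.out_features)     # ['stage8', 'stage16', 'stage24', 'stage32']

# With neither argument set, a backbone returns only the last stage.
print(PixioConfig().out_features)  # ['stage32']
```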
transformers/models/pixio/modeling_pixio.py
@@ -0,0 +1,507 @@
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/pixio/modular_pixio.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_pixio.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # coding=utf-8
8
+ # Copyright 2025 Meta AI and The HuggingFace Inc. team. All rights reserved.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+
22
+ import collections.abc
23
+ from collections.abc import Callable
24
+ from typing import Optional, Union
25
+
26
+ import torch
27
+ from torch import nn
28
+
29
+ from ... import initialization as init
30
+ from ...activations import ACT2FN
31
+ from ...modeling_layers import GradientCheckpointingLayer
32
+ from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling
33
+ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
34
+ from ...processing_utils import Unpack
35
+ from ...utils import TransformersKwargs, auto_docstring, is_tracing
36
+ from ...utils.backbone_utils import BackboneMixin
37
+ from ...utils.generic import check_model_inputs
38
+ from .configuration_pixio import PixioConfig
39
+
40
+
41
+ class PixioPatchEmbeddings(nn.Module):
42
+ """
43
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
44
+ `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
45
+ Transformer.
46
+ """
47
+
48
+ def __init__(self, config: PixioConfig):
49
+ super().__init__()
50
+ image_size, patch_size = config.image_size, config.patch_size
51
+ num_channels, hidden_size = config.num_channels, config.hidden_size
52
+
53
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
54
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
55
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
56
+ self.image_size = image_size
57
+ self.patch_size = patch_size
58
+ self.num_channels = num_channels
59
+ self.num_patches = num_patches
60
+
61
+ self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
62
+
63
+ def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
64
+ batch_size, num_channels, height, width = pixel_values.shape
65
+ if num_channels != self.num_channels:
66
+ raise ValueError(
67
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
68
+ f" Expected {self.num_channels} but got {num_channels}."
69
+ )
70
+ if not interpolate_pos_encoding:
71
+ if height != self.image_size[0] or width != self.image_size[1]:
72
+ raise ValueError(
73
+ f"Input image size ({height}*{width}) doesn't match model"
74
+ f" ({self.image_size[0]}*{self.image_size[1]})."
75
+ )
76
+ embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
77
+ return embeddings
78
+
79
+
+ class PixioEmbeddings(nn.Module):
+     """
+     Construct the CLS tokens, position and patch embeddings.
+     """
+
+     def __init__(self, config: PixioConfig) -> None:
+         super().__init__()
+
+         self.cls_token = nn.Parameter(torch.randn(1, config.n_cls_tokens, config.hidden_size))
+         self.mask_token = None
+         self.patch_embeddings = PixioPatchEmbeddings(config)
+         num_patches = self.patch_embeddings.num_patches
+         self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + config.n_cls_tokens, config.hidden_size))
+         self.dropout = nn.Dropout(config.hidden_dropout_prob)
+         self.n_cls_tokens = config.n_cls_tokens
+         self.patch_size = config.patch_size
+         self.config = config
+
+     def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+         """
+         This method interpolates the pre-trained position encodings so that the model can be used on
+         higher-resolution images. It is also adapted to support tracing and interpolation at torch.float32 precision.
+
+         Adapted from:
+         - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+         - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+         """
+         num_patches = embeddings.shape[1] - self.n_cls_tokens
+         num_positions = self.position_embeddings.shape[1] - self.n_cls_tokens
+
+         if not is_tracing() and num_patches == num_positions and height == width:
+             return self.position_embeddings
+
+         class_pos_embed = self.position_embeddings[:, : self.n_cls_tokens]
+         patch_pos_embed = self.position_embeddings[:, self.n_cls_tokens :]
+
+         dim = embeddings.shape[-1]
+
+         new_height = height // self.patch_size
+         new_width = width // self.patch_size
+
+         sqrt_num_positions = int(num_positions**0.5)
+         patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+         patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+         target_dtype = patch_pos_embed.dtype
+         patch_pos_embed = nn.functional.interpolate(
+             patch_pos_embed.to(torch.float32),
+             size=(new_height, new_width),
+             mode="bicubic",
+             align_corners=False,
+         ).to(dtype=target_dtype)
+
+         patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+         return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+     def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+         batch_size, _, height, width = pixel_values.shape
+         target_dtype = self.patch_embeddings.projection.weight.dtype
+         embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
+
+         cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+         embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+         embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+
+         embeddings = self.dropout(embeddings)
+
+         return embeddings
+
+
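The resize performed by `interpolate_pos_encoding` reduces to a bicubic interpolation of the square grid of patch position embeddings, carried out in float32 and cast back afterwards. A standalone sketch of just that step, with assumed grid sizes:

```python
import torch
from torch import nn

dim, old_grid, new_grid = 1280, 16, 32  # assumed sizes, for illustration only
patch_pos_embed = torch.randn(1, old_grid * old_grid, dim)

# (1, N, dim) -> (1, dim, grid, grid), interpolate, then back to (1, N', dim)
grid = patch_pos_embed.reshape(1, old_grid, old_grid, dim).permute(0, 3, 1, 2)
grid = nn.functional.interpolate(
    grid.to(torch.float32), size=(new_grid, new_grid), mode="bicubic", align_corners=False
)
patch_pos_embed = grid.permute(0, 2, 3, 1).view(1, -1, dim)
print(patch_pos_embed.shape)  # torch.Size([1, 1024, 1280])
```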
+ def eager_attention_forward(
+     module: nn.Module,
+     query: torch.Tensor,
+     key: torch.Tensor,
+     value: torch.Tensor,
+     attention_mask: Optional[torch.Tensor],
+     scaling: Optional[float] = None,
+     dropout: float = 0.0,
+     **kwargs: Unpack[TransformersKwargs],
+ ):
+     if scaling is None:
+         scaling = query.size(-1) ** -0.5
+
+     # Take the dot product between "query" and "key" to get the raw attention scores.
+     attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
+
+     if attention_mask is not None:
+         attention_mask = attention_mask[:, :, :, : key.shape[-2]]
+         attn_weights = attn_weights + attention_mask
+
+     attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+
+     attn_output = torch.matmul(attn_weights, value)
+     attn_output = attn_output.transpose(1, 2).contiguous()
+
+     return attn_output, attn_weights
+
+
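With no mask and zero dropout, `eager_attention_forward` should agree numerically with PyTorch's built-in scaled dot-product attention, up to the trailing transpose. A quick sanity check, assuming the function above is in scope:

```python
import torch
from torch import nn

q = torch.randn(1, 2, 5, 8)  # (batch, heads, seq_len, head_dim)
k, v = torch.randn_like(q), torch.randn_like(q)

out, _ = eager_attention_forward(nn.Module(), q, k, v, attention_mask=None)
ref = nn.functional.scaled_dot_product_attention(q, k, v)
# eager_attention_forward returns (batch, seq_len, heads, head_dim)
print(torch.allclose(out, ref.transpose(1, 2), atol=1e-6))  # True
```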
180
+ class PixioSelfAttention(nn.Module):
181
+ def __init__(self, config: PixioConfig):
182
+ super().__init__()
183
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
184
+ raise ValueError(
185
+ f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
186
+ f"heads {config.num_attention_heads}."
187
+ )
188
+
189
+ self.config = config
190
+ self.num_attention_heads = config.num_attention_heads
191
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
192
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
193
+ self.dropout_prob = config.attention_probs_dropout_prob
194
+ self.scaling = self.attention_head_size**-0.5
195
+ self.is_causal = False
196
+
197
+ self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
198
+ self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
199
+ self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
200
+
201
+ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
202
+ batch_size = hidden_states.shape[0]
203
+ new_shape = batch_size, -1, self.num_attention_heads, self.attention_head_size
204
+
205
+ key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2)
206
+ value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
207
+ query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)
208
+
209
+ attention_interface: Callable = eager_attention_forward
210
+ if self.config._attn_implementation != "eager":
211
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
212
+
213
+ context_layer, attention_probs = attention_interface(
214
+ self,
215
+ query_layer,
216
+ key_layer,
217
+ value_layer,
218
+ None,
219
+ is_causal=self.is_causal,
220
+ scaling=self.scaling,
221
+ dropout=0.0 if not self.training else self.dropout_prob,
222
+ )
223
+
224
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
225
+ context_layer = context_layer.reshape(new_context_layer_shape)
226
+
227
+ return context_layer, attention_probs
228
+
229
+
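The `view` + `transpose` pair in `forward` is the standard multi-head reshape: `(batch, seq, hidden)` becomes `(batch, heads, seq, head_dim)` so each head attends independently. A tiny sketch with assumed sizes:

```python
import torch

batch, seq_len, num_heads, head_dim = 2, 10, 16, 80  # assumed; hidden = 1280
hidden_states = torch.randn(batch, seq_len, num_heads * head_dim)

# Split the hidden dimension into heads, then move heads ahead of the sequence.
layer = hidden_states.view(batch, -1, num_heads, head_dim).transpose(1, 2)
print(layer.shape)  # torch.Size([2, 16, 10, 80])
```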
+ class PixioSelfOutput(nn.Module):
+     """
+     The residual connection is defined in PixioLayer instead of here (as is usually the case with other models),
+     due to the layernorm applied before each block.
+     """
+
+     def __init__(self, config: PixioConfig):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+     def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+         hidden_states = self.dense(hidden_states)
+         hidden_states = self.dropout(hidden_states)
+         return hidden_states
+
+
+ class PixioAttention(nn.Module):
+     def __init__(self, config: PixioConfig):
+         super().__init__()
+         self.attention = PixioSelfAttention(config)
+         self.output = PixioSelfOutput(config)
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         self_attn_output, _ = self.attention(hidden_states)
+         output = self.output(self_attn_output, hidden_states)
+         return output
+
+
+ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+     """
+     Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+     """
+     if drop_prob == 0.0 or not training:
+         return input
+     keep_prob = 1 - drop_prob
+     shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+     random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+     random_tensor.floor_()  # binarize
+     output = input.div(keep_prob) * random_tensor
+     return output
+
+
+ class PixioDropPath(nn.Module):
+     """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+     def __init__(self, drop_prob: Optional[float] = None) -> None:
+         super().__init__()
+         self.drop_prob = drop_prob
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         return drop_path(hidden_states, self.drop_prob, self.training)
+
+     def extra_repr(self) -> str:
+         return f"p={self.drop_prob}"
+
+
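The `div(keep_prob)` in `drop_path` is what keeps stochastic depth unbiased: dropped samples are compensated by scaling the survivors, so the expected activation is unchanged at train time. A quick empirical check, assuming the function above is in scope:

```python
import torch

torch.manual_seed(0)
x = torch.ones(100_000, 4)
y = drop_path(x, drop_prob=0.3, training=True)

print((y == 0).all(dim=1).float().mean().item())  # ~0.3 of rows are dropped
print(y.mean().item())                            # ~1.0, matching the input mean
```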
+ class PixioMLP(nn.Module):
+     def __init__(self, config) -> None:
+         super().__init__()
+         in_features = out_features = config.hidden_size
+         hidden_features = int(config.hidden_size * config.mlp_ratio)
+         self.fc1 = nn.Linear(in_features, hidden_features, bias=True)
+         if isinstance(config.hidden_act, str):
+             self.activation = ACT2FN[config.hidden_act]
+         else:
+             self.activation = config.hidden_act
+         self.fc2 = nn.Linear(hidden_features, out_features, bias=True)
+
+     def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+         hidden_state = self.fc1(hidden_state)
+         hidden_state = self.activation(hidden_state)
+         hidden_state = self.fc2(hidden_state)
+         return hidden_state
+
+
+ class PixioLayer(GradientCheckpointingLayer):
+     def __init__(self, config: PixioConfig) -> None:
+         super().__init__()
+
+         self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+         self.attention = PixioAttention(config)
+         self.drop_path = PixioDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
+
+         self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+         self.mlp = PixioMLP(config)
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         hidden_states_norm = self.norm1(hidden_states)
+         self_attention_output = self.attention(hidden_states_norm)
+
+         hidden_states = self.drop_path(self_attention_output) + hidden_states
+
+         layer_output = self.norm2(hidden_states)
+         layer_output = self.mlp(layer_output)
+
+         layer_output = self.drop_path(layer_output) + hidden_states
+
+         return layer_output
+
+
+ class PixioEncoder(nn.Module):
+     def __init__(self, config: PixioConfig):
+         super().__init__()
+         self.config = config
+         self.layer = nn.ModuleList([PixioLayer(config) for _ in range(config.num_hidden_layers)])
+         self.gradient_checkpointing = False
+
+     def forward(self, hidden_states: torch.Tensor, output_hidden_states: bool = False) -> BaseModelOutput:
+         all_hidden_states = [hidden_states] if output_hidden_states else None
+         for layer_module in self.layer:
+             hidden_states = layer_module(hidden_states)
+             if all_hidden_states is not None:
+                 all_hidden_states.append(hidden_states)
+
+         return BaseModelOutput(
+             last_hidden_state=hidden_states,
+             hidden_states=tuple(all_hidden_states) if all_hidden_states else None,
+         )
+
+
352
+ @auto_docstring
353
+ class PixioPreTrainedModel(PreTrainedModel):
354
+ config: PixioConfig
355
+ base_model_prefix = "pixio"
356
+ main_input_name = "pixel_values"
357
+ input_modalities = ("image",)
358
+ supports_gradient_checkpointing = True
359
+ _no_split_modules = ["PixioEmbeddings", "PixioLayer"]
360
+ _supports_sdpa = True
361
+ _supports_flash_attn = True
362
+ _supports_flex_attn = True
363
+ _supports_attention_backend = True
364
+ _can_record_outputs = {
365
+ "hidden_states": PixioLayer,
366
+ "attentions": PixioSelfAttention,
367
+ }
368
+
369
+ @torch.no_grad()
370
+ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]):
371
+ """Initialize the weights"""
372
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
373
+ init.trunc_normal_(module.weight, mean=0.0, std=self.config.initializer_range)
374
+ if module.bias is not None:
375
+ init.zeros_(module.bias)
376
+ elif isinstance(module, nn.LayerNorm):
377
+ init.zeros_(module.bias)
378
+ init.ones_(module.weight)
379
+ elif isinstance(module, PixioEmbeddings):
380
+ init.trunc_normal_(module.position_embeddings, mean=0.0, std=self.config.initializer_range)
381
+ init.trunc_normal_(module.cls_token, mean=0.0, std=self.config.initializer_range)
382
+ if module.mask_token is not None:
383
+ init.zeros_(module.mask_token)
384
+
385
+
+ @auto_docstring
+ class PixioModel(PixioPreTrainedModel):
+     def __init__(self, config: PixioConfig):
+         super().__init__(config)
+         self.config = config
+
+         self.embeddings = PixioEmbeddings(config)
+         self.encoder = PixioEncoder(config)
+
+         self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+         self.post_init()
+
+     def get_input_embeddings(self) -> PixioPatchEmbeddings:
+         return self.embeddings.patch_embeddings
+
+     @check_model_inputs(tie_last_hidden_states=False)
+     @auto_docstring
+     def forward(
+         self,
+         pixel_values: Optional[torch.Tensor] = None,
+         output_hidden_states: Optional[bool] = None,
+         **kwargs,
+     ) -> BaseModelOutputWithPooling:
+         if output_hidden_states is None:
+             output_hidden_states = self.config.output_hidden_states
+
+         if pixel_values is None:
+             raise ValueError("You have to specify pixel_values")
+
+         embedding_output = self.embeddings(pixel_values)
+
+         encoder_outputs: BaseModelOutput = self.encoder(embedding_output, output_hidden_states=output_hidden_states)
+         sequence_output = encoder_outputs.last_hidden_state
+         sequence_output = self.layernorm(sequence_output)
+         pooled_output = sequence_output[:, : self.embeddings.n_cls_tokens, :].mean(dim=1)
+
+         return BaseModelOutputWithPooling(
+             last_hidden_state=sequence_output,
+             pooler_output=pooled_output,
+             hidden_states=encoder_outputs.hidden_states,
+         )
+
+
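A hedged end-to-end sketch for `PixioModel`, mirroring the backbone docstring example below; the `facebook/pixio-huge` checkpoint name is taken from that example and its availability is assumed:

```python
import requests
import torch
from PIL import Image

from transformers import AutoImageProcessor, AutoModel

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("facebook/pixio-huge")
model = AutoModel.from_pretrained("facebook/pixio-huge")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Per-token features, plus the mean over the CLS tokens as the pooled output.
print(outputs.last_hidden_state.shape, outputs.pooler_output.shape)
```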
+ @auto_docstring(
+     custom_intro="""
+     Pixio backbone, to be used with frameworks like DETR and MaskFormer.
+     """
+ )
+ class PixioBackbone(PixioPreTrainedModel, BackboneMixin):
+     def __init__(self, config):
+         super().__init__(config)
+         super()._init_backbone(config)
+
+         self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
+         self.embeddings = PixioEmbeddings(config)
+         self.encoder = PixioEncoder(config)
+
+         self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_input_embeddings(self) -> PixioPatchEmbeddings:
+         return self.embeddings.patch_embeddings
+
+     @check_model_inputs
+     @auto_docstring
+     def forward(
+         self, pixel_values: torch.Tensor, output_hidden_states: Optional[bool] = None, **kwargs
+     ) -> BackboneOutput:
+         r"""
+         Examples:
+
+         ```python
+         >>> from transformers import AutoImageProcessor, AutoBackbone
+         >>> import torch
+         >>> from PIL import Image
+         >>> import requests
+
+         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+         >>> image = Image.open(requests.get(url, stream=True).raw)
+
+         >>> processor = AutoImageProcessor.from_pretrained("facebook/pixio-huge")
+         >>> model = AutoBackbone.from_pretrained(
+         ...     "facebook/pixio-huge", out_features=["stage7", "stage15", "stage23", "stage31"]
+         ... )
+
+         >>> inputs = processor(image, return_tensors="pt")
+
+         >>> outputs = model(**inputs)
+         >>> feature_maps = outputs.feature_maps
+         >>> list(feature_maps[-1].shape)
+         [1, 1280, 16, 16]
+         ```"""
+         if output_hidden_states is None:
+             output_hidden_states = self.config.output_hidden_states
+
+         embedding_output = self.embeddings(pixel_values)
+         output: BaseModelOutput = self.encoder(embedding_output, output_hidden_states=True)
+         hidden_states = output.hidden_states
+
+         feature_maps = []
+         for stage, hidden_state in zip(self.stage_names, hidden_states):
+             if stage in self.out_features:
+                 if self.config.apply_layernorm:
+                     hidden_state = self.layernorm(hidden_state)
+                 if self.config.reshape_hidden_states:
+                     hidden_state = hidden_state[:, self.embeddings.n_cls_tokens :]
+                     batch_size, _, height, width = pixel_values.shape
+                     patch_size = self.config.patch_size
+                     hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
+                     hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+                 feature_maps.append(hidden_state)
+
+         return BackboneOutput(
+             feature_maps=tuple(feature_maps),
+             hidden_states=hidden_states if output_hidden_states else None,
+         )
+
+
+ __all__ = ["PixioModel", "PixioPreTrainedModel", "PixioBackbone"]