transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (671)
  1. transformers/__init__.py +20 -1
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/configuration_utils.py +114 -70
  6. transformers/conversion_mapping.py +68 -5
  7. transformers/core_model_loading.py +201 -35
  8. transformers/dependency_versions_table.py +1 -1
  9. transformers/feature_extraction_utils.py +54 -22
  10. transformers/generation/candidate_generator.py +79 -31
  11. transformers/generation/configuration_utils.py +162 -122
  12. transformers/generation/continuous_batching/cache.py +47 -18
  13. transformers/generation/continuous_batching/cache_manager.py +131 -34
  14. transformers/generation/continuous_batching/continuous_api.py +101 -64
  15. transformers/generation/continuous_batching/requests.py +28 -1
  16. transformers/generation/continuous_batching/scheduler.py +11 -4
  17. transformers/generation/stopping_criteria.py +1 -1
  18. transformers/generation/utils.py +108 -110
  19. transformers/generation/watermarking.py +8 -5
  20. transformers/image_processing_base.py +2 -12
  21. transformers/image_processing_utils_fast.py +15 -4
  22. transformers/initialization.py +37 -0
  23. transformers/integrations/__init__.py +12 -0
  24. transformers/integrations/accelerate.py +44 -111
  25. transformers/integrations/aqlm.py +3 -5
  26. transformers/integrations/awq.py +2 -5
  27. transformers/integrations/bitnet.py +5 -8
  28. transformers/integrations/bitsandbytes.py +16 -15
  29. transformers/integrations/deepspeed.py +18 -3
  30. transformers/integrations/eetq.py +3 -5
  31. transformers/integrations/fbgemm_fp8.py +1 -1
  32. transformers/integrations/finegrained_fp8.py +6 -16
  33. transformers/integrations/flash_attention.py +2 -2
  34. transformers/integrations/higgs.py +2 -5
  35. transformers/integrations/hub_kernels.py +23 -5
  36. transformers/integrations/integration_utils.py +35 -0
  37. transformers/integrations/mistral.py +12 -0
  38. transformers/integrations/moe.py +240 -0
  39. transformers/integrations/mxfp4.py +4 -10
  40. transformers/integrations/peft.py +5 -0
  41. transformers/integrations/quanto.py +5 -2
  42. transformers/integrations/spqr.py +3 -5
  43. transformers/integrations/tensor_parallel.py +167 -221
  44. transformers/integrations/vptq.py +3 -5
  45. transformers/modeling_gguf_pytorch_utils.py +66 -19
  46. transformers/modeling_rope_utils.py +78 -81
  47. transformers/modeling_utils.py +583 -503
  48. transformers/models/__init__.py +19 -0
  49. transformers/models/afmoe/modeling_afmoe.py +7 -16
  50. transformers/models/afmoe/modular_afmoe.py +5 -13
  51. transformers/models/aimv2/modeling_aimv2.py +4 -0
  52. transformers/models/aimv2/modular_aimv2.py +4 -0
  53. transformers/models/albert/modeling_albert.py +3 -0
  54. transformers/models/align/modeling_align.py +12 -6
  55. transformers/models/altclip/modeling_altclip.py +7 -3
  56. transformers/models/apertus/modeling_apertus.py +4 -2
  57. transformers/models/apertus/modular_apertus.py +4 -1
  58. transformers/models/arcee/modeling_arcee.py +1 -1
  59. transformers/models/aria/modeling_aria.py +8 -4
  60. transformers/models/aria/modular_aria.py +7 -3
  61. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  62. transformers/models/auto/auto_factory.py +1 -1
  63. transformers/models/auto/configuration_auto.py +27 -0
  64. transformers/models/auto/feature_extraction_auto.py +7 -3
  65. transformers/models/auto/image_processing_auto.py +4 -2
  66. transformers/models/auto/modeling_auto.py +31 -0
  67. transformers/models/auto/processing_auto.py +4 -0
  68. transformers/models/auto/tokenization_auto.py +132 -153
  69. transformers/models/auto/video_processing_auto.py +5 -2
  70. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  71. transformers/models/bamba/modeling_bamba.py +18 -19
  72. transformers/models/bamba/modular_bamba.py +17 -16
  73. transformers/models/bark/modeling_bark.py +9 -0
  74. transformers/models/bart/configuration_bart.py +0 -1
  75. transformers/models/bart/modeling_bart.py +7 -0
  76. transformers/models/beit/image_processing_beit_fast.py +0 -1
  77. transformers/models/bert/modeling_bert.py +3 -0
  78. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  79. transformers/models/big_bird/modeling_big_bird.py +3 -0
  80. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
  81. transformers/models/bit/modeling_bit.py +5 -1
  82. transformers/models/bitnet/modeling_bitnet.py +1 -1
  83. transformers/models/blenderbot/modeling_blenderbot.py +7 -0
  84. transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
  85. transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
  86. transformers/models/blip/modeling_blip.py +2 -0
  87. transformers/models/blip/modeling_blip_text.py +8 -0
  88. transformers/models/blip_2/modeling_blip_2.py +2 -0
  89. transformers/models/bloom/modeling_bloom.py +13 -44
  90. transformers/models/blt/modeling_blt.py +162 -2
  91. transformers/models/blt/modular_blt.py +168 -3
  92. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  93. transformers/models/bridgetower/modeling_bridgetower.py +6 -0
  94. transformers/models/bros/modeling_bros.py +8 -0
  95. transformers/models/camembert/modeling_camembert.py +109 -106
  96. transformers/models/canine/modeling_canine.py +6 -0
  97. transformers/models/canine/tokenization_canine.py +2 -0
  98. transformers/models/chameleon/modeling_chameleon.py +9 -4
  99. transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
  100. transformers/models/clap/feature_extraction_clap.py +2 -2
  101. transformers/models/clap/modeling_clap.py +25 -15
  102. transformers/models/clip/modeling_clip.py +2 -0
  103. transformers/models/clipseg/modeling_clipseg.py +4 -0
  104. transformers/models/clvp/modeling_clvp.py +14 -3
  105. transformers/models/code_llama/tokenization_code_llama.py +1 -1
  106. transformers/models/codegen/modeling_codegen.py +13 -4
  107. transformers/models/cohere/modeling_cohere.py +1 -1
  108. transformers/models/cohere2/modeling_cohere2.py +1 -1
  109. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
  110. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  111. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  112. transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
  113. transformers/models/convbert/modeling_convbert.py +3 -0
  114. transformers/models/convnext/image_processing_convnext.py +2 -2
  115. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  116. transformers/models/csm/generation_csm.py +19 -22
  117. transformers/models/csm/modeling_csm.py +3 -1
  118. transformers/models/csm/modular_csm.py +2 -0
  119. transformers/models/ctrl/modeling_ctrl.py +14 -2
  120. transformers/models/cvt/modeling_cvt.py +5 -1
  121. transformers/models/cwm/modeling_cwm.py +1 -1
  122. transformers/models/d_fine/configuration_d_fine.py +3 -4
  123. transformers/models/d_fine/modeling_d_fine.py +46 -39
  124. transformers/models/d_fine/modular_d_fine.py +15 -4
  125. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  126. transformers/models/dab_detr/modeling_dab_detr.py +1 -1
  127. transformers/models/dac/modeling_dac.py +4 -4
  128. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  129. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  130. transformers/models/dbrx/configuration_dbrx.py +9 -1
  131. transformers/models/dbrx/modeling_dbrx.py +1 -1
  132. transformers/models/deberta/modeling_deberta.py +2 -0
  133. transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
  134. transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
  135. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
  136. transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
  137. transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
  138. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
  139. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  140. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  141. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  142. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  143. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  144. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  145. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  146. transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
  147. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  148. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  149. transformers/models/detr/configuration_detr.py +1 -1
  150. transformers/models/detr/modeling_detr.py +8 -1
  151. transformers/models/dia/generation_dia.py +3 -10
  152. transformers/models/dia/modeling_dia.py +12 -1
  153. transformers/models/dia/modular_dia.py +11 -0
  154. transformers/models/dia/processing_dia.py +1 -1
  155. transformers/models/diffllama/modeling_diffllama.py +3 -3
  156. transformers/models/diffllama/modular_diffllama.py +2 -2
  157. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  158. transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
  159. transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
  160. transformers/models/distilbert/modeling_distilbert.py +11 -9
  161. transformers/models/doge/modeling_doge.py +1 -1
  162. transformers/models/donut/image_processing_donut_fast.py +0 -1
  163. transformers/models/donut/modeling_donut_swin.py +16 -12
  164. transformers/models/dots1/modeling_dots1.py +14 -5
  165. transformers/models/dpt/configuration_dpt.py +1 -1
  166. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  167. transformers/models/dpt/modular_dpt.py +1 -2
  168. transformers/models/edgetam/configuration_edgetam.py +1 -1
  169. transformers/models/edgetam/modeling_edgetam.py +5 -2
  170. transformers/models/edgetam/modular_edgetam.py +15 -14
  171. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
  172. transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
  173. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  174. transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
  175. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  176. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  177. transformers/models/efficientnet/modeling_efficientnet.py +5 -1
  178. transformers/models/electra/modeling_electra.py +7 -0
  179. transformers/models/emu3/modeling_emu3.py +8 -2
  180. transformers/models/emu3/modular_emu3.py +7 -1
  181. transformers/models/encodec/modeling_encodec.py +14 -0
  182. transformers/models/eomt/image_processing_eomt_fast.py +46 -14
  183. transformers/models/eomt/modeling_eomt.py +7 -0
  184. transformers/models/eomt/modular_eomt.py +7 -0
  185. transformers/models/ernie/modeling_ernie.py +6 -0
  186. transformers/models/ernie/modular_ernie.py +6 -0
  187. transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
  188. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
  189. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
  190. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  191. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  192. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  193. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  194. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  195. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  196. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  197. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  198. transformers/models/esm/modeling_esm.py +6 -0
  199. transformers/models/esm/modeling_esmfold.py +6 -1
  200. transformers/models/evolla/modeling_evolla.py +9 -1
  201. transformers/models/evolla/modular_evolla.py +8 -0
  202. transformers/models/exaone4/modeling_exaone4.py +1 -1
  203. transformers/models/falcon/modeling_falcon.py +3 -3
  204. transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
  205. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  206. transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
  207. transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
  208. transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
  209. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
  210. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  211. transformers/models/flaubert/modeling_flaubert.py +14 -15
  212. transformers/models/flava/image_processing_flava_fast.py +0 -2
  213. transformers/models/flava/modeling_flava.py +4 -1
  214. transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
  215. transformers/models/florence2/modeling_florence2.py +20 -3
  216. transformers/models/florence2/modular_florence2.py +13 -0
  217. transformers/models/fnet/modeling_fnet.py +7 -0
  218. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  219. transformers/models/fuyu/modeling_fuyu.py +3 -1
  220. transformers/models/fuyu/processing_fuyu.py +16 -0
  221. transformers/models/gemma/modeling_gemma.py +10 -12
  222. transformers/models/gemma/modular_gemma.py +9 -11
  223. transformers/models/gemma2/modeling_gemma2.py +1 -1
  224. transformers/models/gemma2/modular_gemma2.py +1 -1
  225. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  226. transformers/models/gemma3/modeling_gemma3.py +28 -7
  227. transformers/models/gemma3/modular_gemma3.py +26 -6
  228. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  229. transformers/models/gemma3n/modeling_gemma3n.py +47 -9
  230. transformers/models/gemma3n/modular_gemma3n.py +51 -9
  231. transformers/models/git/modeling_git.py +181 -126
  232. transformers/models/glm/modeling_glm.py +1 -1
  233. transformers/models/glm4/modeling_glm4.py +1 -1
  234. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  235. transformers/models/glm46v/modeling_glm46v.py +3 -1
  236. transformers/models/glm46v/modular_glm46v.py +3 -0
  237. transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
  238. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  239. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  240. transformers/models/glm4v/modeling_glm4v.py +15 -5
  241. transformers/models/glm4v/modular_glm4v.py +11 -3
  242. transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
  243. transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
  244. transformers/models/glmasr/__init__.py +30 -0
  245. transformers/models/glmasr/configuration_glmasr.py +197 -0
  246. transformers/models/glmasr/modeling_glmasr.py +512 -0
  247. transformers/models/glmasr/modular_glmasr.py +433 -0
  248. transformers/models/glmasr/processing_glmasr.py +332 -0
  249. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  250. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  251. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  252. transformers/models/gpt2/modeling_gpt2.py +8 -5
  253. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
  254. transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
  255. transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
  256. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
  257. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  258. transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
  259. transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
  260. transformers/models/gptj/modeling_gptj.py +15 -6
  261. transformers/models/granite/modeling_granite.py +1 -1
  262. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  263. transformers/models/granitemoe/modeling_granitemoe.py +2 -3
  264. transformers/models/granitemoe/modular_granitemoe.py +1 -2
  265. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  266. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
  267. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  268. transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
  269. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  270. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
  271. transformers/models/groupvit/modeling_groupvit.py +6 -1
  272. transformers/models/helium/modeling_helium.py +1 -1
  273. transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
  274. transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
  275. transformers/models/hubert/modeling_hubert.py +4 -0
  276. transformers/models/hubert/modular_hubert.py +4 -0
  277. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
  278. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  279. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  280. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
  281. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  282. transformers/models/ibert/modeling_ibert.py +16 -0
  283. transformers/models/idefics/modeling_idefics.py +10 -0
  284. transformers/models/idefics2/modeling_idefics2.py +7 -1
  285. transformers/models/idefics3/modeling_idefics3.py +5 -1
  286. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  287. transformers/models/imagegpt/modeling_imagegpt.py +9 -2
  288. transformers/models/instructblip/modeling_instructblip.py +2 -0
  289. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  290. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  291. transformers/models/internvl/modeling_internvl.py +11 -8
  292. transformers/models/internvl/modular_internvl.py +5 -9
  293. transformers/models/internvl/video_processing_internvl.py +0 -1
  294. transformers/models/jais2/__init__.py +27 -0
  295. transformers/models/jais2/configuration_jais2.py +152 -0
  296. transformers/models/jais2/modeling_jais2.py +486 -0
  297. transformers/models/jais2/modular_jais2.py +196 -0
  298. transformers/models/jamba/modeling_jamba.py +24 -19
  299. transformers/models/jamba/modular_jamba.py +17 -17
  300. transformers/models/janus/image_processing_janus_fast.py +0 -1
  301. transformers/models/janus/modeling_janus.py +15 -7
  302. transformers/models/janus/modular_janus.py +16 -7
  303. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  304. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  305. transformers/models/kosmos2/modeling_kosmos2.py +14 -2
  306. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  307. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  308. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
  309. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  310. transformers/models/lasr/configuration_lasr.py +4 -0
  311. transformers/models/lasr/modeling_lasr.py +3 -2
  312. transformers/models/lasr/modular_lasr.py +8 -1
  313. transformers/models/lasr/processing_lasr.py +0 -2
  314. transformers/models/layoutlm/modeling_layoutlm.py +5 -3
  315. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  316. transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
  317. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
  318. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  319. transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
  320. transformers/models/led/modeling_led.py +6 -0
  321. transformers/models/levit/modeling_levit.py +18 -0
  322. transformers/models/lfm2/modeling_lfm2.py +1 -1
  323. transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
  324. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  325. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  326. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  327. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  328. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  329. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  330. transformers/models/lilt/modeling_lilt.py +19 -15
  331. transformers/models/llama/modeling_llama.py +1 -1
  332. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  333. transformers/models/llama4/modeling_llama4.py +8 -4
  334. transformers/models/llava/image_processing_llava_fast.py +0 -1
  335. transformers/models/llava/modeling_llava.py +12 -7
  336. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  337. transformers/models/llava_next/modeling_llava_next.py +7 -3
  338. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  339. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  340. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  341. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  342. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  343. transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
  344. transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
  345. transformers/models/longt5/modeling_longt5.py +0 -4
  346. transformers/models/m2m_100/modeling_m2m_100.py +10 -0
  347. transformers/models/mamba/modeling_mamba.py +2 -1
  348. transformers/models/mamba2/modeling_mamba2.py +24 -23
  349. transformers/models/marian/configuration_marian.py +1 -1
  350. transformers/models/marian/modeling_marian.py +3 -0
  351. transformers/models/markuplm/modeling_markuplm.py +5 -8
  352. transformers/models/mask2former/configuration_mask2former.py +3 -3
  353. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  354. transformers/models/mask2former/modeling_mask2former.py +9 -0
  355. transformers/models/maskformer/configuration_maskformer.py +3 -3
  356. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  357. transformers/models/maskformer/modeling_maskformer.py +9 -1
  358. transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
  359. transformers/models/mbart/configuration_mbart.py +1 -0
  360. transformers/models/mbart/modeling_mbart.py +7 -0
  361. transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
  362. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  363. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  364. transformers/models/mimi/modeling_mimi.py +25 -4
  365. transformers/models/minimax/modeling_minimax.py +16 -3
  366. transformers/models/minimax/modular_minimax.py +12 -1
  367. transformers/models/ministral/modeling_ministral.py +1 -1
  368. transformers/models/ministral3/modeling_ministral3.py +1 -1
  369. transformers/models/mistral/modeling_mistral.py +1 -1
  370. transformers/models/mistral3/modeling_mistral3.py +10 -4
  371. transformers/models/mistral3/modular_mistral3.py +3 -1
  372. transformers/models/mixtral/modeling_mixtral.py +12 -4
  373. transformers/models/mixtral/modular_mixtral.py +6 -2
  374. transformers/models/mlcd/modeling_mlcd.py +6 -0
  375. transformers/models/mlcd/modular_mlcd.py +4 -0
  376. transformers/models/mllama/modeling_mllama.py +13 -2
  377. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  378. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
  379. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  380. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  381. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  382. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  383. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  384. transformers/models/mobilevit/modeling_mobilevit.py +4 -0
  385. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
  386. transformers/models/modernbert/modeling_modernbert.py +12 -1
  387. transformers/models/modernbert/modular_modernbert.py +12 -1
  388. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
  389. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
  390. transformers/models/moonshine/modeling_moonshine.py +1 -1
  391. transformers/models/moshi/modeling_moshi.py +21 -51
  392. transformers/models/mpnet/modeling_mpnet.py +2 -0
  393. transformers/models/mra/modeling_mra.py +4 -1
  394. transformers/models/mt5/configuration_mt5.py +2 -3
  395. transformers/models/mt5/modeling_mt5.py +0 -10
  396. transformers/models/musicgen/modeling_musicgen.py +5 -9
  397. transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
  398. transformers/models/mvp/modeling_mvp.py +7 -0
  399. transformers/models/nanochat/modeling_nanochat.py +1 -1
  400. transformers/models/nemotron/modeling_nemotron.py +3 -3
  401. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  402. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  403. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  404. transformers/models/nougat/tokenization_nougat.py +11 -16
  405. transformers/models/nystromformer/modeling_nystromformer.py +7 -0
  406. transformers/models/olmo/modeling_olmo.py +1 -1
  407. transformers/models/olmo2/modeling_olmo2.py +1 -1
  408. transformers/models/olmo3/modeling_olmo3.py +1 -1
  409. transformers/models/olmoe/modeling_olmoe.py +12 -4
  410. transformers/models/olmoe/modular_olmoe.py +4 -2
  411. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  412. transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
  413. transformers/models/oneformer/configuration_oneformer.py +3 -3
  414. transformers/models/oneformer/modeling_oneformer.py +7 -38
  415. transformers/models/openai/modeling_openai.py +12 -0
  416. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  417. transformers/models/ovis2/modeling_ovis2.py +15 -3
  418. transformers/models/ovis2/modular_ovis2.py +8 -0
  419. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  420. transformers/models/owlv2/modeling_owlv2.py +7 -3
  421. transformers/models/owlv2/modular_owlv2.py +0 -2
  422. transformers/models/owlvit/modeling_owlvit.py +7 -3
  423. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
  424. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
  425. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
  426. transformers/models/paligemma/modeling_paligemma.py +25 -17
  427. transformers/models/parakeet/modeling_parakeet.py +5 -0
  428. transformers/models/parakeet/modular_parakeet.py +5 -0
  429. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  430. transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
  431. transformers/models/patchtst/modeling_patchtst.py +5 -4
  432. transformers/models/pe_audio/__init__.py +30 -0
  433. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  434. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  435. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  436. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  437. transformers/models/pe_audio/processing_pe_audio.py +24 -0
  438. transformers/models/pe_audio_video/__init__.py +29 -0
  439. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  440. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  441. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  442. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  443. transformers/models/pe_video/__init__.py +30 -0
  444. transformers/models/pe_video/configuration_pe_video.py +211 -0
  445. transformers/models/pe_video/modeling_pe_video.py +636 -0
  446. transformers/models/pe_video/modular_pe_video.py +219 -0
  447. transformers/models/pe_video/processing_pe_video.py +10 -0
  448. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  449. transformers/models/pegasus/configuration_pegasus.py +1 -0
  450. transformers/models/pegasus/modeling_pegasus.py +3 -0
  451. transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
  452. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  453. transformers/models/perceiver/modeling_perceiver.py +5 -1
  454. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  455. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  456. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  457. transformers/models/persimmon/modeling_persimmon.py +1 -1
  458. transformers/models/phi/modeling_phi.py +1 -1
  459. transformers/models/phi3/modeling_phi3.py +1 -1
  460. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
  461. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
  462. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  463. transformers/models/phimoe/modeling_phimoe.py +12 -4
  464. transformers/models/phimoe/modular_phimoe.py +1 -1
  465. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  466. transformers/models/pixio/__init__.py +30 -0
  467. transformers/models/pixio/configuration_pixio.py +151 -0
  468. transformers/models/pixio/modeling_pixio.py +507 -0
  469. transformers/models/pixio/modular_pixio.py +404 -0
  470. transformers/models/pixtral/modeling_pixtral.py +1 -1
  471. transformers/models/pixtral/processing_pixtral.py +3 -1
  472. transformers/models/plbart/configuration_plbart.py +1 -0
  473. transformers/models/plbart/modeling_plbart.py +7 -0
  474. transformers/models/plbart/modular_plbart.py +6 -0
  475. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  476. transformers/models/poolformer/modeling_poolformer.py +11 -1
  477. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  478. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  479. transformers/models/prophetnet/modeling_prophetnet.py +2 -1
  480. transformers/models/qwen2/modeling_qwen2.py +1 -1
  481. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
  482. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
  483. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
  484. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
  485. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
  486. transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
  487. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  488. transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
  489. transformers/models/qwen3/modeling_qwen3.py +1 -1
  490. transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
  491. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
  492. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  493. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
  494. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
  495. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  496. transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
  497. transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
  498. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  499. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
  500. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
  501. transformers/models/rag/configuration_rag.py +0 -8
  502. transformers/models/rag/modeling_rag.py +7 -9
  503. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
  504. transformers/models/reformer/modeling_reformer.py +9 -1
  505. transformers/models/regnet/modeling_regnet.py +4 -0
  506. transformers/models/rembert/modeling_rembert.py +7 -1
  507. transformers/models/resnet/modeling_resnet.py +8 -3
  508. transformers/models/roberta/modeling_roberta.py +3 -0
  509. transformers/models/roberta/modular_roberta.py +3 -0
  510. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  511. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  512. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  513. transformers/models/rt_detr/modeling_rt_detr.py +4 -0
  514. transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
  515. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  516. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
  517. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  518. transformers/models/rwkv/modeling_rwkv.py +1 -1
  519. transformers/models/sam/configuration_sam.py +1 -0
  520. transformers/models/sam/image_processing_sam_fast.py +0 -1
  521. transformers/models/sam/modeling_sam.py +4 -1
  522. transformers/models/sam2/configuration_sam2.py +1 -1
  523. transformers/models/sam2/modeling_sam2.py +5 -1
  524. transformers/models/sam2/modular_sam2.py +5 -1
  525. transformers/models/sam2_video/modeling_sam2_video.py +51 -43
  526. transformers/models/sam2_video/modular_sam2_video.py +31 -18
  527. transformers/models/sam3/configuration_sam3.py +21 -1
  528. transformers/models/sam3/modeling_sam3.py +23 -0
  529. transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
  530. transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
  531. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  532. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
  533. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  534. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  535. transformers/models/sam3_video/modeling_sam3_video.py +3 -3
  536. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  537. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  538. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  539. transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
  540. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
  541. transformers/models/seed_oss/modeling_seed_oss.py +1 -1
  542. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  543. transformers/models/segformer/modeling_segformer.py +2 -2
  544. transformers/models/segformer/modular_segformer.py +0 -1
  545. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  546. transformers/models/siglip/modeling_siglip.py +24 -2
  547. transformers/models/siglip2/modeling_siglip2.py +63 -41
  548. transformers/models/smollm3/modeling_smollm3.py +1 -1
  549. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  550. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  551. transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
  552. transformers/models/speecht5/modeling_speecht5.py +28 -0
  553. transformers/models/splinter/modeling_splinter.py +9 -3
  554. transformers/models/squeezebert/modeling_squeezebert.py +2 -0
  555. transformers/models/stablelm/modeling_stablelm.py +1 -1
  556. transformers/models/starcoder2/modeling_starcoder2.py +1 -1
  557. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  558. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  559. transformers/models/swiftformer/modeling_swiftformer.py +4 -0
  560. transformers/models/swin/modeling_swin.py +16 -12
  561. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  562. transformers/models/swin2sr/modeling_swin2sr.py +49 -33
  563. transformers/models/swinv2/modeling_swinv2.py +41 -33
  564. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  565. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  566. transformers/models/t5/configuration_t5.py +7 -1
  567. transformers/models/t5/modeling_t5.py +1 -7
  568. transformers/models/t5gemma/modeling_t5gemma.py +1 -1
  569. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  570. transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
  571. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  572. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  573. transformers/models/table_transformer/modeling_table_transformer.py +1 -1
  574. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  575. transformers/models/timesfm/modeling_timesfm.py +12 -0
  576. transformers/models/timesfm/modular_timesfm.py +12 -0
  577. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  578. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  579. transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
  580. transformers/models/trocr/modeling_trocr.py +1 -2
  581. transformers/models/tvp/configuration_tvp.py +5 -1
  582. transformers/models/tvp/modeling_tvp.py +4 -4
  583. transformers/models/udop/configuration_udop.py +1 -0
  584. transformers/models/udop/modeling_udop.py +3 -7
  585. transformers/models/umt5/configuration_umt5.py +2 -2
  586. transformers/models/umt5/modeling_umt5.py +0 -6
  587. transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
  588. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  589. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  590. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  591. transformers/models/video_llava/modeling_video_llava.py +7 -3
  592. transformers/models/vilt/configuration_vilt.py +2 -2
  593. transformers/models/vilt/modeling_vilt.py +7 -0
  594. transformers/models/vipllava/modeling_vipllava.py +7 -3
  595. transformers/models/visual_bert/modeling_visual_bert.py +2 -0
  596. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  597. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  598. transformers/models/vitmatte/modeling_vitmatte.py +4 -0
  599. transformers/models/vitpose/configuration_vitpose.py +1 -1
  600. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  601. transformers/models/voxtral/modeling_voxtral.py +2 -2
  602. transformers/models/voxtral/modular_voxtral.py +2 -2
  603. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
  604. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
  605. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
  606. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  607. transformers/models/whisper/generation_whisper.py +1 -0
  608. transformers/models/whisper/modeling_whisper.py +5 -3
  609. transformers/models/x_clip/modeling_x_clip.py +2 -0
  610. transformers/models/xcodec/modeling_xcodec.py +5 -0
  611. transformers/models/xglm/modeling_xglm.py +10 -0
  612. transformers/models/xlm/modeling_xlm.py +13 -14
  613. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  614. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  615. transformers/models/xlnet/modeling_xlnet.py +3 -1
  616. transformers/models/xmod/modeling_xmod.py +3 -0
  617. transformers/models/yoso/modeling_yoso.py +4 -1
  618. transformers/models/zamba/modeling_zamba.py +2 -1
  619. transformers/models/zamba2/modeling_zamba2.py +3 -2
  620. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  621. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  622. transformers/models/zoedepth/modeling_zoedepth.py +7 -0
  623. transformers/pipelines/__init__.py +9 -6
  624. transformers/pipelines/automatic_speech_recognition.py +20 -12
  625. transformers/pipelines/base.py +1 -1
  626. transformers/pipelines/document_question_answering.py +1 -1
  627. transformers/pipelines/question_answering.py +1 -1
  628. transformers/pipelines/text_to_audio.py +2 -2
  629. transformers/processing_utils.py +127 -56
  630. transformers/quantizers/auto.py +2 -4
  631. transformers/quantizers/base.py +9 -64
  632. transformers/quantizers/quantizer_aqlm.py +1 -18
  633. transformers/quantizers/quantizer_auto_round.py +1 -10
  634. transformers/quantizers/quantizer_awq.py +3 -8
  635. transformers/quantizers/quantizer_bitnet.py +1 -6
  636. transformers/quantizers/quantizer_bnb_4bit.py +9 -49
  637. transformers/quantizers/quantizer_bnb_8bit.py +9 -19
  638. transformers/quantizers/quantizer_compressed_tensors.py +1 -4
  639. transformers/quantizers/quantizer_eetq.py +2 -12
  640. transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
  641. transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
  642. transformers/quantizers/quantizer_fp_quant.py +4 -4
  643. transformers/quantizers/quantizer_gptq.py +1 -4
  644. transformers/quantizers/quantizer_higgs.py +2 -6
  645. transformers/quantizers/quantizer_mxfp4.py +2 -28
  646. transformers/quantizers/quantizer_quanto.py +14 -14
  647. transformers/quantizers/quantizer_spqr.py +3 -8
  648. transformers/quantizers/quantizer_torchao.py +28 -124
  649. transformers/quantizers/quantizer_vptq.py +1 -10
  650. transformers/testing_utils.py +28 -12
  651. transformers/tokenization_mistral_common.py +3 -2
  652. transformers/tokenization_utils_base.py +3 -2
  653. transformers/tokenization_utils_tokenizers.py +25 -2
  654. transformers/trainer.py +24 -2
  655. transformers/trainer_callback.py +8 -0
  656. transformers/trainer_seq2seq.py +4 -0
  657. transformers/training_args.py +8 -10
  658. transformers/utils/__init__.py +4 -0
  659. transformers/utils/attention_visualizer.py +4 -4
  660. transformers/utils/auto_docstring.py +34 -25
  661. transformers/utils/generic.py +20 -0
  662. transformers/utils/import_utils.py +51 -9
  663. transformers/utils/kernel_config.py +71 -18
  664. transformers/utils/quantization_config.py +8 -8
  665. transformers/video_processing_utils.py +16 -12
  666. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
  667. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
  668. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
  669. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  670. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
  671. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/llama4/modeling_llama4.py

@@ -188,7 +188,7 @@ class Llama4TextRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
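
The RoPE change above (applied identically to LongcatFlash further down) swaps a plain tensor attribute for a non-persistent buffer built from a clone. A minimal sketch of why that matters, assuming standard torch.nn.Module semantics; RotarySketch is an illustrative class, not the transformers implementation:

import torch
from torch import nn


class RotarySketch(nn.Module):
    def __init__(self, dim: int = 8):
        super().__init__()
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Old style: `self.original_inv_freq = inv_freq` aliased the same
        # storage and was not moved by `module.to(device)`; registering a
        # clone as a buffer fixes both.
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)


m = RotarySketch()
m.inv_freq.mul_(0.5)  # e.g. a dynamic-scaling RoPE variant rescaling in place
assert not torch.equal(m.inv_freq, m.original_inv_freq)  # clone broke the alias
assert "original_inv_freq" not in m.state_dict()  # persistent=False: not serialized
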
@@ -1387,6 +1387,7 @@ class Llama4ForConditionalGeneration(Llama4PreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1398,12 +1399,15 @@ class Llama4ForConditionalGeneration(Llama4PreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsequent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
 
        return model_inputs
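
This `prepare_inputs_for_generation` change, repeated across the Llava-family models below, replaces the `cache_position[0] == 0` prefill check with an explicit `is_first_iteration` flag, presumably threaded down from the generation loop (see the `generation/utils.py` entry in the file list). A toy illustration of the case the old check got wrong; the numbers are hypothetical:

import torch

# Generation resumes from a pre-filled KV cache, e.g. a cached system prompt.
cached_prompt_len = 32                 # tokens already in the cache
new_tokens = 5                         # user turn containing image placeholder tokens
cache_position = torch.arange(cached_prompt_len, cached_prompt_len + new_tokens)

old_check = cache_position[0] == 0     # False: pixel values would be dropped
is_first_iteration = True              # first call of this generate() run
new_check = is_first_iteration         # True: pixel values are forwarded

assert not old_check and new_check
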
transformers/models/llava/image_processing_llava_fast.py

@@ -149,7 +149,6 @@ class LlavaImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
 
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
transformers/models/llava/modeling_llava.py

@@ -202,10 +202,11 @@ class LlavaModel(LlavaPreTrainedModel):
         image_features = self.multi_modal_projector(selected_image_feature)
 
         if "image_sizes" in kwargs:
-            split_sizes = [
-                (height // self.vision_tower.patch_size) * (width // self.vision_tower.patch_size)
-                for height, width in kwargs["image_sizes"]
-            ]
+            split_sizes = (
+                (torch.as_tensor(kwargs["image_sizes"], device=image_features.device) // self.vision_tower.patch_size)
+                .prod(dim=-1)
+                .tolist()
+            )
             image_features = torch.split(image_features.squeeze(0), split_sizes)
         else:
             image_features = list(image_features)
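
A standalone check, with hypothetical image sizes and patch size, that the vectorised `split_sizes` computation matches the list comprehension it replaces: floor-dividing each (height, width) by the patch size and taking the per-image product gives the patch-token count per image.

import torch

image_sizes = [(336, 336), (672, 336), (448, 672)]
patch_size = 14

old = [(h // patch_size) * (w // patch_size) for h, w in image_sizes]
new = (torch.as_tensor(image_sizes) // patch_size).prod(dim=-1).tolist()
assert old == new  # [576, 1152, 1536]
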
@@ -437,6 +438,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -448,12 +450,15 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsequent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
 
         return model_inputs
transformers/models/llava_next/image_processing_llava_next_fast.py

@@ -260,7 +260,6 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast):
 
         if do_pad:
             processed_images = self._pad_for_batching(processed_images)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(
             data={"pixel_values": processed_images, "image_sizes": image_sizes}, tensor_type=return_tensors
         )
transformers/models/llava_next/modeling_llava_next.py

@@ -692,6 +692,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -703,12 +704,15 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-        # Otherwise we need pixel values to be passed to model
-        if cache_position[0] == 0:
+        # Pixel values are used only in the first iteration if available
+        # In subsequent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache)
+        if is_first_iteration or not kwargs.get("use_cache", True):
             model_inputs["pixel_values"] = pixel_values
             model_inputs["image_sizes"] = image_sizes
 
transformers/models/llava_next_video/modeling_llava_next_video.py

@@ -868,6 +868,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- extra custom processing
@@ -879,12 +880,15 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-        # Otherwise we need pixel values to be passed to model
-        if cache_position[0] == 0:
+        # Pixel values are used only in the first iteration if available
+        # In subsequent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache)
+        if is_first_iteration or not kwargs.get("use_cache", True):
             model_inputs["pixel_values"] = pixel_values
             model_inputs["pixel_values_videos"] = pixel_values_videos
             model_inputs["image_sizes"] = image_sizes
transformers/models/llava_next_video/modular_llava_next_video.py

@@ -693,6 +693,7 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- extra custom processing
@@ -704,12 +705,15 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-        # Otherwise we need pixel values to be passed to model
-        if cache_position[0] == 0:
+        # Pixel values are used only in the first iteration if available
+        # In subsequent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache)
+        if is_first_iteration or not kwargs.get("use_cache", True):
             model_inputs["pixel_values"] = pixel_values
             model_inputs["pixel_values_videos"] = pixel_values_videos
             model_inputs["image_sizes"] = image_sizes
transformers/models/llava_onevision/image_processing_llava_onevision_fast.py

@@ -279,7 +279,6 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast):
 
         if do_pad:
             processed_images = self._pad_for_batching(processed_images)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(
             data={"pixel_values": processed_images, "image_sizes": image_sizes, "batch_num_images": batch_num_images},
             tensor_type=return_tensors,
transformers/models/llava_onevision/modeling_llava_onevision.py

@@ -846,6 +846,7 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -857,12 +858,15 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsequent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["image_sizes"] = image_sizes
             model_inputs["pixel_values_videos"] = pixel_values_videos
transformers/models/llava_onevision/modular_llava_onevision.py

@@ -211,7 +211,6 @@ class LlavaOnevisionImageProcessorFast(LlavaNextImageProcessorFast):
 
         if do_pad:
             processed_images = self._pad_for_batching(processed_images)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(
             data={"pixel_values": processed_images, "image_sizes": image_sizes, "batch_num_images": batch_num_images},
             tensor_type=return_tensors,
@@ -698,6 +697,7 @@ class LlavaOnevisionForConditionalGeneration(LlavaNextVideoForConditionalGeneration):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -709,12 +709,15 @@ class LlavaOnevisionForConditionalGeneration(LlavaNextVideoForConditionalGeneration):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsequent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["image_sizes"] = image_sizes
             model_inputs["pixel_values_videos"] = pixel_values_videos
@@ -82,7 +82,7 @@ class LongcatFlashRotaryEmbedding(nn.Module):
    inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

    self.register_buffer("inv_freq", inv_freq, persistent=False)
-   self.original_inv_freq = inv_freq
+   self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

@staticmethod
def compute_default_rope_parameters(
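Two things change here: `original_inv_freq` becomes a non-persistent buffer, so it follows `.to()` device moves without entering the checkpoint, and `.clone()` decouples it from later in-place updates to `inv_freq` (e.g. dynamic RoPE rescaling). A self-contained demonstration of both effects:

import torch
from torch import nn

class Demo(nn.Module):
    def __init__(self):
        super().__init__()
        inv_freq = torch.arange(4, dtype=torch.float32)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.plain_copy = inv_freq  # plain attribute: ignored by .to(), aliases inv_freq
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

m = Demo()
print(list(m.state_dict().keys()))  # [] -- non-persistent buffers are not saved
m.inv_freq.mul_(2)                  # in-place edit, as dynamic rescaling would do
print(m.plain_copy)                 # tensor([0., 2., 4., 6.]) -- aliased, changed too
print(m.original_inv_freq)          # tensor([0., 1., 2., 3.]) -- protected by clone()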
@@ -563,6 +563,7 @@ class LongcatFlashPreTrainedModel(PreTrainedModel):
    super()._init_weights(module)
    if isinstance(module, LongcatFlashTopkRouter):
        init.normal_(module.classifier.weight, mean=0.0, std=self.config.initializer_range)
+       init.zeros_(module.e_score_correction_bias)
    if isinstance(module, LongcatFlashExperts):
        if module.gate_up_proj is not None:
            init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
@@ -347,6 +347,7 @@ class LongcatFlashPreTrainedModel(PreTrainedModel):
    super()._init_weights(module)
    if isinstance(module, LongcatFlashTopkRouter):
        init.normal_(module.classifier.weight, mean=0.0, std=self.config.initializer_range)
+       init.zeros_(module.e_score_correction_bias)
    if isinstance(module, LongcatFlashExperts):
        if module.gate_up_proj is not None:
            init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
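Zero-initializing `e_score_correction_bias` makes a freshly initialized router neutral. If the router follows the now-common bias-corrected top-k scheme (DeepSeek-V3 style; an assumption here, not something this diff confirms), the bias shifts only which experts get selected, not how the selected experts are weighted:

import torch

def topk_route(scores, correction_bias, k=2):
    # The bias nudges expert *selection* (used for load balancing)...
    _, idx = (scores + correction_bias).topk(k, dim=-1)
    # ...while the combine weights come from the uncorrected scores.
    return idx, scores.gather(-1, idx)

scores = torch.rand(3, 8)  # 3 tokens, 8 experts
bias = torch.zeros(8)      # zero init: routing starts unbiased
idx, weights = topk_route(scores, bias)
print(idx.shape, weights.shape)  # torch.Size([3, 2]) torch.Size([3, 2])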
@@ -1583,12 +1583,10 @@ class LongT5Model(LongT5PreTrainedModel):
    encoder_config = copy.deepcopy(config)
    encoder_config.is_decoder = False
    encoder_config.use_cache = False
-   encoder_config.tie_encoder_decoder = False
    self.encoder = LongT5Stack(encoder_config)

    decoder_config = copy.deepcopy(config)
    decoder_config.is_decoder = True
-   decoder_config.tie_encoder_decoder = False
    decoder_config.num_layers = config.num_decoder_layers
    self.decoder = LongT5Stack(decoder_config)

@@ -1746,12 +1744,10 @@ class LongT5ForConditionalGeneration(LongT5PreTrainedModel, GenerationMixin):
    encoder_config = copy.deepcopy(config)
    encoder_config.is_decoder = False
    encoder_config.use_cache = False
-   encoder_config.tie_encoder_decoder = False
    self.encoder = LongT5Stack(encoder_config)

    decoder_config = copy.deepcopy(config)
    decoder_config.is_decoder = True
-   decoder_config.tie_encoder_decoder = False
    decoder_config.num_layers = config.num_decoder_layers
    self.decoder = LongT5Stack(decoder_config)

@@ -22,6 +22,7 @@ import torch
from torch import nn
from torch.nn import CrossEntropyLoss

+ from ... import initialization as init
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
@@ -84,6 +85,7 @@ class M2M100SinusoidalPositionalEmbedding(nn.Module):
    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
        super().__init__()
        self.offset = 2
+       self.num_positions = num_positions
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
@@ -515,6 +517,14 @@ class M2M100PreTrainedModel(PreTrainedModel):
    # Doesn't support `compile` (dynamic control flow). Can be fixed but low usage model
    _can_compile_fullgraph = False

+   def _init_weights(self, module):
+       super()._init_weights(module)
+       if isinstance(module, M2M100SinusoidalPositionalEmbedding):
+           emb_weights = module.get_embedding(
+               module.num_positions + module.offset, module.embedding_dim, module.padding_idx
+           )
+           init.copy_(module.weights, emb_weights)
+

class M2M100Encoder(M2M100PreTrainedModel):
    """
@@ -26,7 +26,7 @@ from ... import initialization as init
from ...activations import ACT2FN
from ...configuration_utils import PreTrainedConfig
from ...generation import GenerationMixin
- from ...integrations.hub_kernels import lazy_load_kernel
+ from ...integrations import lazy_load_kernel
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_utils import PreTrainedModel
from ...utils import (
@@ -750,6 +750,7 @@ class MambaForCausalLM(MambaPreTrainedModel, GenerationMixin):
    cache_params: Optional[MambaCache] = None,
    cache_position: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.LongTensor] = None,
+   is_first_iteration: Optional[bool] = False,
    **kwargs,
):
    # Overwritten -- uses `cache_params` as opposed to `past_key_values`
@@ -24,6 +24,7 @@ from torch import nn
from ... import initialization as init
from ...activations import ACT2FN
from ...generation import GenerationMixin
+ from ...integrations import lazy_load_kernel
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_utils import PreTrainedModel
from ...utils import (
@@ -31,35 +32,12 @@ from ...utils import (
    auto_docstring,
    logging,
)
- from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
from .configuration_mamba2 import Mamba2Config


logger = logging.get_logger(__name__)


- if is_mamba_2_ssm_available():
-     from mamba_ssm.ops.triton.selective_state_update import selective_state_update
-     from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
- else:
-     mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined, selective_state_update = None, None, None
-
- if is_causal_conv1d_available():
-     from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
- else:
-     causal_conv1d_update, causal_conv1d_fn = None, None
-
- is_fast_path_available = all(
-     (
-         selective_state_update,
-         mamba_chunk_scan_combined,
-         mamba_split_conv1d_scan_combined,
-         causal_conv1d_fn,
-         causal_conv1d_update,
-     )
- )
-
-
# Helper methods for segment sum computation


@@ -286,6 +264,28 @@ class Mamba2Mixer(nn.Module):
    self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
    self.use_bias = config.use_bias

+   global causal_conv1d_update, causal_conv1d_fn
+   causal_conv1d = lazy_load_kernel("causal-conv1d")
+   causal_conv1d_update = getattr(causal_conv1d, "causal_conv1d_update", None)
+   causal_conv1d_fn = getattr(causal_conv1d, "causal_conv1d_fn", None)
+
+   global selective_state_update, mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
+   mamba_ssm = lazy_load_kernel("mamba-ssm")
+   selective_state_update = getattr(mamba_ssm, "selective_state_update", None)
+   mamba_chunk_scan_combined = getattr(mamba_ssm, "mamba_chunk_scan_combined", None)
+   mamba_split_conv1d_scan_combined = getattr(mamba_ssm, "mamba_split_conv1d_scan_combined", None)
+
+   global is_fast_path_available
+   is_fast_path_available = all(
+       (
+           selective_state_update,
+           mamba_chunk_scan_combined,
+           mamba_split_conv1d_scan_combined,
+           causal_conv1d_fn,
+           causal_conv1d_update,
+       )
+   )
+
    if not is_fast_path_available:
        logger.warning_once(
            "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`"
@@ -955,6 +955,7 @@ class Mamba2ForCausalLM(Mamba2PreTrainedModel, GenerationMixin):
    cache_params: Optional[Mamba2Cache] = None,
    cache_position: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
+   is_first_iteration: Optional[bool] = False,
    **kwargs,
):
    # Overwritten -- uses `cache_params` as opposed to `past_key_values`
@@ -147,7 +147,7 @@ class MarianConfig(PreTrainedConfig):
    self.num_hidden_layers = encoder_layers
    self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
    self.share_encoder_decoder_embeddings = share_encoder_decoder_embeddings
-   kwargs["tie_encoder_decoder"] = share_encoder_decoder_embeddings
+   kwargs["tie_word_embeddings"] = share_encoder_decoder_embeddings
    super().__init__(
        pad_token_id=pad_token_id,
        eos_token_id=eos_token_id,
@@ -451,6 +451,8 @@ class MarianPreTrainedModel(PreTrainedModel):
    super()._init_weights(module)
    if isinstance(module, MarianSinusoidalPositionalEmbedding):
        init.copy_(module.weight, module.create_weight())
+   elif isinstance(module, MarianMTModel):
+       init.zeros_(module.final_logits_bias)

    @property
    def dummy_inputs(self):
@@ -1248,6 +1250,7 @@ class MarianDecoderWrapper(MarianPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.decoder = MarianDecoder(config)
+       self.post_init()

    def forward(self, *args, **kwargs):
        return self.decoder(*args, **kwargs)
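Two notes on these hunks. Marian's `share_encoder_decoder_embeddings` is about sharing embedding matrices, which is what `tie_word_embeddings` controls; `tie_encoder_decoder` would instead tie the entire encoder and decoder stacks, so the old mapping was the wrong knob. And the added `self.post_init()` follows the library-wide convention that every concrete model finishes `__init__` with it so weight initialization and tying actually run. Tying itself is just two modules sharing one tensor; a toy illustration with illustrative names:

import torch
from torch import nn

vocab_size, d_model = 100, 16
embed = nn.Embedding(vocab_size, d_model)
lm_head = nn.Linear(d_model, vocab_size, bias=False)
lm_head.weight = embed.weight  # tying: one Parameter, two views of it
print(lm_head.weight.data_ptr() == embed.weight.data_ptr())  # True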
@@ -14,7 +14,6 @@
# limitations under the License.
"""PyTorch MarkupLM model."""

- import os
from collections.abc import Callable
from typing import Optional, Union

@@ -486,9 +485,9 @@ class MarkupLMEncoder(nn.Module):
        all_hidden_states = all_hidden_states + (hidden_states,)

    layer_outputs = layer_module(
-       hidden_states=hidden_states,
-       attention_mask=attention_mask,
-       output_attentions=output_attentions,
+       hidden_states,
+       attention_mask,
+       output_attentions,
        **kwargs,
    )

@@ -517,10 +516,8 @@ class MarkupLMPreTrainedModel(PreTrainedModel):
    super()._init_weights(module)
    if isinstance(module, MarkupLMLMPredictionHead):
        init.zeros_(module.bias)
-
-   @classmethod
-   def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
-       return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+   elif isinstance(module, MarkupLMEmbeddings):
+       init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


@auto_docstring
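The removed `from_pretrained` override was a pure pass-through (and the last user of the `os` import), so dropping both is behavior-neutral. The new `MarkupLMEmbeddings` branch recomputes the `position_ids` helper buffer rather than trusting the checkpoint; it is just the default absolute-position index row:

import torch

max_len = 512  # illustrative; the real size comes from the module's buffer shape
position_ids = torch.arange(max_len).expand((1, -1))
print(position_ids.shape)  # torch.Size([1, 512])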
@@ -14,7 +14,7 @@
# limitations under the License.
"""Mask2Former model configuration"""

- from typing import Optional
+ from typing import Optional, Union

from ...configuration_utils import PreTrainedConfig
from ...utils import logging
@@ -39,7 +39,7 @@ class Mask2FormerConfig(PreTrainedConfig):
    Currently, Mask2Former only supports the [Swin Transformer](swin) as backbone.

    Args:
-       backbone_config (`PreTrainedConfig` or `dict`, *optional*, defaults to `SwinConfig()`):
+       backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `SwinConfig()`):
            The configuration of the backbone model. If unset, the configuration corresponding to
            `swin-base-patch4-window12-384` will be used.
        backbone (`str`, *optional*):
@@ -134,7 +134,7 @@ class Mask2FormerConfig(PreTrainedConfig):

    def __init__(
        self,
-       backbone_config: Optional[dict] = None,
+       backbone_config: Optional[Union[dict, PreTrainedConfig]] = None,
        feature_size: int = 256,
        mask_feature_size: int = 256,
        hidden_dim: int = 256,
@@ -387,10 +387,7 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast):
    processed_images = reorder_images(processed_images_grouped, grouped_images_index)
    processed_pixel_masks = reorder_images(processed_pixel_masks_grouped, grouped_images_index)
    encoded_inputs = BatchFeature(
-       data={
-           "pixel_values": torch.stack(processed_images, dim=0) if return_tensors else processed_images,
-           "pixel_mask": torch.stack(processed_pixel_masks, dim=0) if return_tensors else processed_pixel_masks,
-       },
+       data={"pixel_values": processed_images, "pixel_mask": processed_pixel_masks},
        tensor_type=return_tensors,
    )
    if segmentation_maps is not None:
@@ -2149,6 +2149,10 @@ class Mask2FormerPreTrainedModel(PreTrainedModel):
        init.normal_(module.weight, mean=0.0, std=std)
        if module.bias is not None:
            init.zeros_(module.bias)
+       if getattr(module, "running_mean", None) is not None:
+           init.zeros_(module.running_mean)
+           init.ones_(module.running_var)
+           init.zeros_(module.num_batches_tracked)

    elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
        init.ones_(module.weight)
@@ -2160,6 +2164,11 @@ class Mask2FormerPreTrainedModel(PreTrainedModel):
        if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
            init.zeros_(module.weight[module.padding_idx])

+   elif isinstance(module, Mask2FormerLoss):
+       empty_weight = torch.ones(module.num_labels + 1)
+       empty_weight[-1] = module.eos_coef
+       init.copy_(module.empty_weight, empty_weight)
+
    if hasattr(module, "reference_points"):
        init.xavier_uniform_(module.reference_points.weight, gain=1.0)
        init.constant_(module.reference_points.bias, 0.0)
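The `running_mean` branch re-initializes BatchNorm-style running statistics to PyTorch's own defaults (what `reset_running_stats()` would produce), using the library's `init` wrappers, presumably so the tensors are also tracked via the `_is_hf_initialized` flag seen in the nearby comment. The plain-PyTorch equivalent:

import torch
from torch import nn

bn = nn.BatchNorm2d(8)
bn.running_mean.fill_(3.0)        # pretend stale stats survived a partial load
bn.num_batches_tracked.fill_(10)

if getattr(bn, "running_mean", None) is not None:
    bn.running_mean.zero_()         # defaults: mean 0 ...
    bn.running_var.fill_(1.0)       # ... variance 1 ...
    bn.num_batches_tracked.zero_()  # ... and no batches seen

print(bn.running_mean.sum().item(), bn.running_var.sum().item(), bn.num_batches_tracked.item())
# 0.0 8.0 0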
@@ -14,7 +14,7 @@
# limitations under the License.
"""MaskFormer model configuration"""

- from typing import Optional
+ from typing import Optional, Union

from ...configuration_utils import PreTrainedConfig
from ...utils import logging
@@ -49,7 +49,7 @@ class MaskFormerConfig(PreTrainedConfig):
    use_auxiliary_loss (`bool`, *optional*, defaults to `False`):
        If `True`, [`MaskFormerForInstanceSegmentationOutput`] will contain the auxiliary losses computed using the
        logits from each decoder's stage.
-   backbone_config (`Dict`, *optional*):
+   backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `SwinConfig()`):
        The configuration passed to the backbone; if unset, the configuration corresponding to
        `swin-base-patch4-window12-384` will be used.
    backbone (`str`, *optional*):
@@ -114,7 +114,7 @@ class MaskFormerConfig(PreTrainedConfig):
    mask_feature_size: int = 256,
    no_object_weight: float = 0.1,
    use_auxiliary_loss: bool = False,
-   backbone_config: Optional[dict] = None,
+   backbone_config: Optional[Union[dict, PreTrainedConfig]] = None,
    decoder_config: Optional[dict] = None,
    init_std: float = 0.02,
    init_xavier_std: float = 1.0,
@@ -391,10 +391,7 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast):
    processed_images = reorder_images(processed_images_grouped, grouped_images_index)
    processed_pixel_masks = reorder_images(processed_pixel_masks_grouped, grouped_images_index)
    encoded_inputs = BatchFeature(
-       data={
-           "pixel_values": torch.stack(processed_images, dim=0) if return_tensors else processed_images,
-           "pixel_mask": torch.stack(processed_pixel_masks, dim=0) if return_tensors else processed_pixel_masks,
-       },
+       data={"pixel_values": processed_images, "pixel_mask": processed_pixel_masks},
        tensor_type=return_tensors,
    )
    if segmentation_maps is not None:
@@ -174,7 +174,7 @@ class MaskFormerModelOutput(ModelOutput):
    custom_intro="""
    Class for outputs of [`MaskFormerForInstanceSegmentation`].

-   This output can be directly passed to [`~MaskFormerImageProcessor.post_process_semantic_segmentation`] or or
+   This output can be directly passed to [`~MaskFormerImageProcessor.post_process_semantic_segmentation`] or
    [`~MaskFormerImageProcessor.post_process_instance_segmentation`] or
    [`~MaskFormerImageProcessor.post_process_panoptic_segmentation`] depending on the task. Please see
    [`~MaskFormerImageProcessor`] for details regarding usage.
@@ -1470,11 +1470,19 @@ class MaskFormerPreTrainedModel(PreTrainedModel):
        init.normal_(module.weight, mean=0.0, std=std)
        if module.bias is not None:
            init.zeros_(module.bias)
+       if getattr(module, "running_mean", None) is not None:
+           init.zeros_(module.running_mean)
+           init.ones_(module.running_var)
+           init.zeros_(module.num_batches_tracked)
    elif isinstance(module, nn.Embedding):
        init.normal_(module.weight, mean=0.0, std=std)
        # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it loses the flag
        if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
            init.zeros_(module.weight[module.padding_idx])
+   elif isinstance(module, MaskFormerLoss):
+       empty_weight = torch.ones(module.num_labels + 1)
+       empty_weight[-1] = module.eos_coef
+       init.copy_(module.empty_weight, empty_weight)


@auto_docstring
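`empty_weight` is the DETR-style class-weight vector: one weight per real class plus a down-weighted "no object" slot at index `num_labels`, fed to the cross-entropy so null matches don't dominate the loss. Rebuilding it in `_init_weights` makes it independent of checkpoint contents. A runnable illustration of how such a vector is used:

import torch
from torch.nn.functional import cross_entropy

num_labels, eos_coef = 5, 0.1
empty_weight = torch.ones(num_labels + 1)
empty_weight[-1] = eos_coef  # index num_labels == "no object"

logits = torch.randn(4, num_labels + 1)
targets = torch.tensor([0, 2, num_labels, num_labels])  # two "no object" targets
# Down-weighting the null class keeps abundant non-matches from dominating.
print(cross_entropy(logits, targets, weight=empty_weight))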