transformers 5.0.0rc1-py3-none-any.whl → 5.0.0rc2-py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (671)
  1. transformers/__init__.py +20 -1
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/configuration_utils.py +114 -70
  6. transformers/conversion_mapping.py +68 -5
  7. transformers/core_model_loading.py +201 -35
  8. transformers/dependency_versions_table.py +1 -1
  9. transformers/feature_extraction_utils.py +54 -22
  10. transformers/generation/candidate_generator.py +79 -31
  11. transformers/generation/configuration_utils.py +162 -122
  12. transformers/generation/continuous_batching/cache.py +47 -18
  13. transformers/generation/continuous_batching/cache_manager.py +131 -34
  14. transformers/generation/continuous_batching/continuous_api.py +101 -64
  15. transformers/generation/continuous_batching/requests.py +28 -1
  16. transformers/generation/continuous_batching/scheduler.py +11 -4
  17. transformers/generation/stopping_criteria.py +1 -1
  18. transformers/generation/utils.py +108 -110
  19. transformers/generation/watermarking.py +8 -5
  20. transformers/image_processing_base.py +2 -12
  21. transformers/image_processing_utils_fast.py +15 -4
  22. transformers/initialization.py +37 -0
  23. transformers/integrations/__init__.py +12 -0
  24. transformers/integrations/accelerate.py +44 -111
  25. transformers/integrations/aqlm.py +3 -5
  26. transformers/integrations/awq.py +2 -5
  27. transformers/integrations/bitnet.py +5 -8
  28. transformers/integrations/bitsandbytes.py +16 -15
  29. transformers/integrations/deepspeed.py +18 -3
  30. transformers/integrations/eetq.py +3 -5
  31. transformers/integrations/fbgemm_fp8.py +1 -1
  32. transformers/integrations/finegrained_fp8.py +6 -16
  33. transformers/integrations/flash_attention.py +2 -2
  34. transformers/integrations/higgs.py +2 -5
  35. transformers/integrations/hub_kernels.py +23 -5
  36. transformers/integrations/integration_utils.py +35 -0
  37. transformers/integrations/mistral.py +12 -0
  38. transformers/integrations/moe.py +240 -0
  39. transformers/integrations/mxfp4.py +4 -10
  40. transformers/integrations/peft.py +5 -0
  41. transformers/integrations/quanto.py +5 -2
  42. transformers/integrations/spqr.py +3 -5
  43. transformers/integrations/tensor_parallel.py +167 -221
  44. transformers/integrations/vptq.py +3 -5
  45. transformers/modeling_gguf_pytorch_utils.py +66 -19
  46. transformers/modeling_rope_utils.py +78 -81
  47. transformers/modeling_utils.py +583 -503
  48. transformers/models/__init__.py +19 -0
  49. transformers/models/afmoe/modeling_afmoe.py +7 -16
  50. transformers/models/afmoe/modular_afmoe.py +5 -13
  51. transformers/models/aimv2/modeling_aimv2.py +4 -0
  52. transformers/models/aimv2/modular_aimv2.py +4 -0
  53. transformers/models/albert/modeling_albert.py +3 -0
  54. transformers/models/align/modeling_align.py +12 -6
  55. transformers/models/altclip/modeling_altclip.py +7 -3
  56. transformers/models/apertus/modeling_apertus.py +4 -2
  57. transformers/models/apertus/modular_apertus.py +4 -1
  58. transformers/models/arcee/modeling_arcee.py +1 -1
  59. transformers/models/aria/modeling_aria.py +8 -4
  60. transformers/models/aria/modular_aria.py +7 -3
  61. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  62. transformers/models/auto/auto_factory.py +1 -1
  63. transformers/models/auto/configuration_auto.py +27 -0
  64. transformers/models/auto/feature_extraction_auto.py +7 -3
  65. transformers/models/auto/image_processing_auto.py +4 -2
  66. transformers/models/auto/modeling_auto.py +31 -0
  67. transformers/models/auto/processing_auto.py +4 -0
  68. transformers/models/auto/tokenization_auto.py +132 -153
  69. transformers/models/auto/video_processing_auto.py +5 -2
  70. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  71. transformers/models/bamba/modeling_bamba.py +18 -19
  72. transformers/models/bamba/modular_bamba.py +17 -16
  73. transformers/models/bark/modeling_bark.py +9 -0
  74. transformers/models/bart/configuration_bart.py +0 -1
  75. transformers/models/bart/modeling_bart.py +7 -0
  76. transformers/models/beit/image_processing_beit_fast.py +0 -1
  77. transformers/models/bert/modeling_bert.py +3 -0
  78. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  79. transformers/models/big_bird/modeling_big_bird.py +3 -0
  80. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
  81. transformers/models/bit/modeling_bit.py +5 -1
  82. transformers/models/bitnet/modeling_bitnet.py +1 -1
  83. transformers/models/blenderbot/modeling_blenderbot.py +7 -0
  84. transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
  85. transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
  86. transformers/models/blip/modeling_blip.py +2 -0
  87. transformers/models/blip/modeling_blip_text.py +8 -0
  88. transformers/models/blip_2/modeling_blip_2.py +2 -0
  89. transformers/models/bloom/modeling_bloom.py +13 -44
  90. transformers/models/blt/modeling_blt.py +162 -2
  91. transformers/models/blt/modular_blt.py +168 -3
  92. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  93. transformers/models/bridgetower/modeling_bridgetower.py +6 -0
  94. transformers/models/bros/modeling_bros.py +8 -0
  95. transformers/models/camembert/modeling_camembert.py +109 -106
  96. transformers/models/canine/modeling_canine.py +6 -0
  97. transformers/models/canine/tokenization_canine.py +2 -0
  98. transformers/models/chameleon/modeling_chameleon.py +9 -4
  99. transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
  100. transformers/models/clap/feature_extraction_clap.py +2 -2
  101. transformers/models/clap/modeling_clap.py +25 -15
  102. transformers/models/clip/modeling_clip.py +2 -0
  103. transformers/models/clipseg/modeling_clipseg.py +4 -0
  104. transformers/models/clvp/modeling_clvp.py +14 -3
  105. transformers/models/code_llama/tokenization_code_llama.py +1 -1
  106. transformers/models/codegen/modeling_codegen.py +13 -4
  107. transformers/models/cohere/modeling_cohere.py +1 -1
  108. transformers/models/cohere2/modeling_cohere2.py +1 -1
  109. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
  110. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  111. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  112. transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
  113. transformers/models/convbert/modeling_convbert.py +3 -0
  114. transformers/models/convnext/image_processing_convnext.py +2 -2
  115. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  116. transformers/models/csm/generation_csm.py +19 -22
  117. transformers/models/csm/modeling_csm.py +3 -1
  118. transformers/models/csm/modular_csm.py +2 -0
  119. transformers/models/ctrl/modeling_ctrl.py +14 -2
  120. transformers/models/cvt/modeling_cvt.py +5 -1
  121. transformers/models/cwm/modeling_cwm.py +1 -1
  122. transformers/models/d_fine/configuration_d_fine.py +3 -4
  123. transformers/models/d_fine/modeling_d_fine.py +46 -39
  124. transformers/models/d_fine/modular_d_fine.py +15 -4
  125. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  126. transformers/models/dab_detr/modeling_dab_detr.py +1 -1
  127. transformers/models/dac/modeling_dac.py +4 -4
  128. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  129. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  130. transformers/models/dbrx/configuration_dbrx.py +9 -1
  131. transformers/models/dbrx/modeling_dbrx.py +1 -1
  132. transformers/models/deberta/modeling_deberta.py +2 -0
  133. transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
  134. transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
  135. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
  136. transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
  137. transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
  138. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
  139. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  140. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  141. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  142. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  143. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  144. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  145. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  146. transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
  147. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  148. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  149. transformers/models/detr/configuration_detr.py +1 -1
  150. transformers/models/detr/modeling_detr.py +8 -1
  151. transformers/models/dia/generation_dia.py +3 -10
  152. transformers/models/dia/modeling_dia.py +12 -1
  153. transformers/models/dia/modular_dia.py +11 -0
  154. transformers/models/dia/processing_dia.py +1 -1
  155. transformers/models/diffllama/modeling_diffllama.py +3 -3
  156. transformers/models/diffllama/modular_diffllama.py +2 -2
  157. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  158. transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
  159. transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
  160. transformers/models/distilbert/modeling_distilbert.py +11 -9
  161. transformers/models/doge/modeling_doge.py +1 -1
  162. transformers/models/donut/image_processing_donut_fast.py +0 -1
  163. transformers/models/donut/modeling_donut_swin.py +16 -12
  164. transformers/models/dots1/modeling_dots1.py +14 -5
  165. transformers/models/dpt/configuration_dpt.py +1 -1
  166. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  167. transformers/models/dpt/modular_dpt.py +1 -2
  168. transformers/models/edgetam/configuration_edgetam.py +1 -1
  169. transformers/models/edgetam/modeling_edgetam.py +5 -2
  170. transformers/models/edgetam/modular_edgetam.py +15 -14
  171. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
  172. transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
  173. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  174. transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
  175. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  176. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  177. transformers/models/efficientnet/modeling_efficientnet.py +5 -1
  178. transformers/models/electra/modeling_electra.py +7 -0
  179. transformers/models/emu3/modeling_emu3.py +8 -2
  180. transformers/models/emu3/modular_emu3.py +7 -1
  181. transformers/models/encodec/modeling_encodec.py +14 -0
  182. transformers/models/eomt/image_processing_eomt_fast.py +46 -14
  183. transformers/models/eomt/modeling_eomt.py +7 -0
  184. transformers/models/eomt/modular_eomt.py +7 -0
  185. transformers/models/ernie/modeling_ernie.py +6 -0
  186. transformers/models/ernie/modular_ernie.py +6 -0
  187. transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
  188. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
  189. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
  190. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  191. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  192. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  193. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  194. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  195. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  196. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  197. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  198. transformers/models/esm/modeling_esm.py +6 -0
  199. transformers/models/esm/modeling_esmfold.py +6 -1
  200. transformers/models/evolla/modeling_evolla.py +9 -1
  201. transformers/models/evolla/modular_evolla.py +8 -0
  202. transformers/models/exaone4/modeling_exaone4.py +1 -1
  203. transformers/models/falcon/modeling_falcon.py +3 -3
  204. transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
  205. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  206. transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
  207. transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
  208. transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
  209. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
  210. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  211. transformers/models/flaubert/modeling_flaubert.py +14 -15
  212. transformers/models/flava/image_processing_flava_fast.py +0 -2
  213. transformers/models/flava/modeling_flava.py +4 -1
  214. transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
  215. transformers/models/florence2/modeling_florence2.py +20 -3
  216. transformers/models/florence2/modular_florence2.py +13 -0
  217. transformers/models/fnet/modeling_fnet.py +7 -0
  218. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  219. transformers/models/fuyu/modeling_fuyu.py +3 -1
  220. transformers/models/fuyu/processing_fuyu.py +16 -0
  221. transformers/models/gemma/modeling_gemma.py +10 -12
  222. transformers/models/gemma/modular_gemma.py +9 -11
  223. transformers/models/gemma2/modeling_gemma2.py +1 -1
  224. transformers/models/gemma2/modular_gemma2.py +1 -1
  225. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  226. transformers/models/gemma3/modeling_gemma3.py +28 -7
  227. transformers/models/gemma3/modular_gemma3.py +26 -6
  228. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  229. transformers/models/gemma3n/modeling_gemma3n.py +47 -9
  230. transformers/models/gemma3n/modular_gemma3n.py +51 -9
  231. transformers/models/git/modeling_git.py +181 -126
  232. transformers/models/glm/modeling_glm.py +1 -1
  233. transformers/models/glm4/modeling_glm4.py +1 -1
  234. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  235. transformers/models/glm46v/modeling_glm46v.py +3 -1
  236. transformers/models/glm46v/modular_glm46v.py +3 -0
  237. transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
  238. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  239. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  240. transformers/models/glm4v/modeling_glm4v.py +15 -5
  241. transformers/models/glm4v/modular_glm4v.py +11 -3
  242. transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
  243. transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
  244. transformers/models/glmasr/__init__.py +30 -0
  245. transformers/models/glmasr/configuration_glmasr.py +197 -0
  246. transformers/models/glmasr/modeling_glmasr.py +512 -0
  247. transformers/models/glmasr/modular_glmasr.py +433 -0
  248. transformers/models/glmasr/processing_glmasr.py +332 -0
  249. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  250. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  251. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  252. transformers/models/gpt2/modeling_gpt2.py +8 -5
  253. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
  254. transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
  255. transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
  256. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
  257. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  258. transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
  259. transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
  260. transformers/models/gptj/modeling_gptj.py +15 -6
  261. transformers/models/granite/modeling_granite.py +1 -1
  262. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  263. transformers/models/granitemoe/modeling_granitemoe.py +2 -3
  264. transformers/models/granitemoe/modular_granitemoe.py +1 -2
  265. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  266. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
  267. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  268. transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
  269. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  270. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
  271. transformers/models/groupvit/modeling_groupvit.py +6 -1
  272. transformers/models/helium/modeling_helium.py +1 -1
  273. transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
  274. transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
  275. transformers/models/hubert/modeling_hubert.py +4 -0
  276. transformers/models/hubert/modular_hubert.py +4 -0
  277. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
  278. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  279. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  280. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
  281. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  282. transformers/models/ibert/modeling_ibert.py +16 -0
  283. transformers/models/idefics/modeling_idefics.py +10 -0
  284. transformers/models/idefics2/modeling_idefics2.py +7 -1
  285. transformers/models/idefics3/modeling_idefics3.py +5 -1
  286. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  287. transformers/models/imagegpt/modeling_imagegpt.py +9 -2
  288. transformers/models/instructblip/modeling_instructblip.py +2 -0
  289. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  290. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  291. transformers/models/internvl/modeling_internvl.py +11 -8
  292. transformers/models/internvl/modular_internvl.py +5 -9
  293. transformers/models/internvl/video_processing_internvl.py +0 -1
  294. transformers/models/jais2/__init__.py +27 -0
  295. transformers/models/jais2/configuration_jais2.py +152 -0
  296. transformers/models/jais2/modeling_jais2.py +486 -0
  297. transformers/models/jais2/modular_jais2.py +196 -0
  298. transformers/models/jamba/modeling_jamba.py +24 -19
  299. transformers/models/jamba/modular_jamba.py +17 -17
  300. transformers/models/janus/image_processing_janus_fast.py +0 -1
  301. transformers/models/janus/modeling_janus.py +15 -7
  302. transformers/models/janus/modular_janus.py +16 -7
  303. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  304. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  305. transformers/models/kosmos2/modeling_kosmos2.py +14 -2
  306. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  307. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  308. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
  309. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  310. transformers/models/lasr/configuration_lasr.py +4 -0
  311. transformers/models/lasr/modeling_lasr.py +3 -2
  312. transformers/models/lasr/modular_lasr.py +8 -1
  313. transformers/models/lasr/processing_lasr.py +0 -2
  314. transformers/models/layoutlm/modeling_layoutlm.py +5 -3
  315. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  316. transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
  317. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
  318. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  319. transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
  320. transformers/models/led/modeling_led.py +6 -0
  321. transformers/models/levit/modeling_levit.py +18 -0
  322. transformers/models/lfm2/modeling_lfm2.py +1 -1
  323. transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
  324. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  325. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  326. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  327. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  328. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  329. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  330. transformers/models/lilt/modeling_lilt.py +19 -15
  331. transformers/models/llama/modeling_llama.py +1 -1
  332. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  333. transformers/models/llama4/modeling_llama4.py +8 -4
  334. transformers/models/llava/image_processing_llava_fast.py +0 -1
  335. transformers/models/llava/modeling_llava.py +12 -7
  336. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  337. transformers/models/llava_next/modeling_llava_next.py +7 -3
  338. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  339. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  340. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  341. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  342. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  343. transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
  344. transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
  345. transformers/models/longt5/modeling_longt5.py +0 -4
  346. transformers/models/m2m_100/modeling_m2m_100.py +10 -0
  347. transformers/models/mamba/modeling_mamba.py +2 -1
  348. transformers/models/mamba2/modeling_mamba2.py +24 -23
  349. transformers/models/marian/configuration_marian.py +1 -1
  350. transformers/models/marian/modeling_marian.py +3 -0
  351. transformers/models/markuplm/modeling_markuplm.py +5 -8
  352. transformers/models/mask2former/configuration_mask2former.py +3 -3
  353. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  354. transformers/models/mask2former/modeling_mask2former.py +9 -0
  355. transformers/models/maskformer/configuration_maskformer.py +3 -3
  356. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  357. transformers/models/maskformer/modeling_maskformer.py +9 -1
  358. transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
  359. transformers/models/mbart/configuration_mbart.py +1 -0
  360. transformers/models/mbart/modeling_mbart.py +7 -0
  361. transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
  362. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  363. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  364. transformers/models/mimi/modeling_mimi.py +25 -4
  365. transformers/models/minimax/modeling_minimax.py +16 -3
  366. transformers/models/minimax/modular_minimax.py +12 -1
  367. transformers/models/ministral/modeling_ministral.py +1 -1
  368. transformers/models/ministral3/modeling_ministral3.py +1 -1
  369. transformers/models/mistral/modeling_mistral.py +1 -1
  370. transformers/models/mistral3/modeling_mistral3.py +10 -4
  371. transformers/models/mistral3/modular_mistral3.py +3 -1
  372. transformers/models/mixtral/modeling_mixtral.py +12 -4
  373. transformers/models/mixtral/modular_mixtral.py +6 -2
  374. transformers/models/mlcd/modeling_mlcd.py +6 -0
  375. transformers/models/mlcd/modular_mlcd.py +4 -0
  376. transformers/models/mllama/modeling_mllama.py +13 -2
  377. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  378. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
  379. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  380. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  381. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  382. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  383. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  384. transformers/models/mobilevit/modeling_mobilevit.py +4 -0
  385. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
  386. transformers/models/modernbert/modeling_modernbert.py +12 -1
  387. transformers/models/modernbert/modular_modernbert.py +12 -1
  388. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
  389. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
  390. transformers/models/moonshine/modeling_moonshine.py +1 -1
  391. transformers/models/moshi/modeling_moshi.py +21 -51
  392. transformers/models/mpnet/modeling_mpnet.py +2 -0
  393. transformers/models/mra/modeling_mra.py +4 -1
  394. transformers/models/mt5/configuration_mt5.py +2 -3
  395. transformers/models/mt5/modeling_mt5.py +0 -10
  396. transformers/models/musicgen/modeling_musicgen.py +5 -9
  397. transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
  398. transformers/models/mvp/modeling_mvp.py +7 -0
  399. transformers/models/nanochat/modeling_nanochat.py +1 -1
  400. transformers/models/nemotron/modeling_nemotron.py +3 -3
  401. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  402. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  403. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  404. transformers/models/nougat/tokenization_nougat.py +11 -16
  405. transformers/models/nystromformer/modeling_nystromformer.py +7 -0
  406. transformers/models/olmo/modeling_olmo.py +1 -1
  407. transformers/models/olmo2/modeling_olmo2.py +1 -1
  408. transformers/models/olmo3/modeling_olmo3.py +1 -1
  409. transformers/models/olmoe/modeling_olmoe.py +12 -4
  410. transformers/models/olmoe/modular_olmoe.py +4 -2
  411. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  412. transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
  413. transformers/models/oneformer/configuration_oneformer.py +3 -3
  414. transformers/models/oneformer/modeling_oneformer.py +7 -38
  415. transformers/models/openai/modeling_openai.py +12 -0
  416. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  417. transformers/models/ovis2/modeling_ovis2.py +15 -3
  418. transformers/models/ovis2/modular_ovis2.py +8 -0
  419. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  420. transformers/models/owlv2/modeling_owlv2.py +7 -3
  421. transformers/models/owlv2/modular_owlv2.py +0 -2
  422. transformers/models/owlvit/modeling_owlvit.py +7 -3
  423. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
  424. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
  425. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
  426. transformers/models/paligemma/modeling_paligemma.py +25 -17
  427. transformers/models/parakeet/modeling_parakeet.py +5 -0
  428. transformers/models/parakeet/modular_parakeet.py +5 -0
  429. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  430. transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
  431. transformers/models/patchtst/modeling_patchtst.py +5 -4
  432. transformers/models/pe_audio/__init__.py +30 -0
  433. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  434. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  435. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  436. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  437. transformers/models/pe_audio/processing_pe_audio.py +24 -0
  438. transformers/models/pe_audio_video/__init__.py +29 -0
  439. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  440. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  441. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  442. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  443. transformers/models/pe_video/__init__.py +30 -0
  444. transformers/models/pe_video/configuration_pe_video.py +211 -0
  445. transformers/models/pe_video/modeling_pe_video.py +636 -0
  446. transformers/models/pe_video/modular_pe_video.py +219 -0
  447. transformers/models/pe_video/processing_pe_video.py +10 -0
  448. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  449. transformers/models/pegasus/configuration_pegasus.py +1 -0
  450. transformers/models/pegasus/modeling_pegasus.py +3 -0
  451. transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
  452. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  453. transformers/models/perceiver/modeling_perceiver.py +5 -1
  454. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  455. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  456. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  457. transformers/models/persimmon/modeling_persimmon.py +1 -1
  458. transformers/models/phi/modeling_phi.py +1 -1
  459. transformers/models/phi3/modeling_phi3.py +1 -1
  460. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
  461. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
  462. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  463. transformers/models/phimoe/modeling_phimoe.py +12 -4
  464. transformers/models/phimoe/modular_phimoe.py +1 -1
  465. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  466. transformers/models/pixio/__init__.py +30 -0
  467. transformers/models/pixio/configuration_pixio.py +151 -0
  468. transformers/models/pixio/modeling_pixio.py +507 -0
  469. transformers/models/pixio/modular_pixio.py +404 -0
  470. transformers/models/pixtral/modeling_pixtral.py +1 -1
  471. transformers/models/pixtral/processing_pixtral.py +3 -1
  472. transformers/models/plbart/configuration_plbart.py +1 -0
  473. transformers/models/plbart/modeling_plbart.py +7 -0
  474. transformers/models/plbart/modular_plbart.py +6 -0
  475. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  476. transformers/models/poolformer/modeling_poolformer.py +11 -1
  477. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  478. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  479. transformers/models/prophetnet/modeling_prophetnet.py +2 -1
  480. transformers/models/qwen2/modeling_qwen2.py +1 -1
  481. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
  482. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
  483. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
  484. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
  485. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
  486. transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
  487. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  488. transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
  489. transformers/models/qwen3/modeling_qwen3.py +1 -1
  490. transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
  491. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
  492. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  493. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
  494. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
  495. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  496. transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
  497. transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
  498. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  499. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
  500. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
  501. transformers/models/rag/configuration_rag.py +0 -8
  502. transformers/models/rag/modeling_rag.py +7 -9
  503. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
  504. transformers/models/reformer/modeling_reformer.py +9 -1
  505. transformers/models/regnet/modeling_regnet.py +4 -0
  506. transformers/models/rembert/modeling_rembert.py +7 -1
  507. transformers/models/resnet/modeling_resnet.py +8 -3
  508. transformers/models/roberta/modeling_roberta.py +3 -0
  509. transformers/models/roberta/modular_roberta.py +3 -0
  510. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  511. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  512. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  513. transformers/models/rt_detr/modeling_rt_detr.py +4 -0
  514. transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
  515. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  516. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
  517. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  518. transformers/models/rwkv/modeling_rwkv.py +1 -1
  519. transformers/models/sam/configuration_sam.py +1 -0
  520. transformers/models/sam/image_processing_sam_fast.py +0 -1
  521. transformers/models/sam/modeling_sam.py +4 -1
  522. transformers/models/sam2/configuration_sam2.py +1 -1
  523. transformers/models/sam2/modeling_sam2.py +5 -1
  524. transformers/models/sam2/modular_sam2.py +5 -1
  525. transformers/models/sam2_video/modeling_sam2_video.py +51 -43
  526. transformers/models/sam2_video/modular_sam2_video.py +31 -18
  527. transformers/models/sam3/configuration_sam3.py +21 -1
  528. transformers/models/sam3/modeling_sam3.py +23 -0
  529. transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
  530. transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
  531. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  532. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
  533. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  534. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  535. transformers/models/sam3_video/modeling_sam3_video.py +3 -3
  536. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  537. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  538. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  539. transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
  540. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
  541. transformers/models/seed_oss/modeling_seed_oss.py +1 -1
  542. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  543. transformers/models/segformer/modeling_segformer.py +2 -2
  544. transformers/models/segformer/modular_segformer.py +0 -1
  545. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  546. transformers/models/siglip/modeling_siglip.py +24 -2
  547. transformers/models/siglip2/modeling_siglip2.py +63 -41
  548. transformers/models/smollm3/modeling_smollm3.py +1 -1
  549. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  550. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  551. transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
  552. transformers/models/speecht5/modeling_speecht5.py +28 -0
  553. transformers/models/splinter/modeling_splinter.py +9 -3
  554. transformers/models/squeezebert/modeling_squeezebert.py +2 -0
  555. transformers/models/stablelm/modeling_stablelm.py +1 -1
  556. transformers/models/starcoder2/modeling_starcoder2.py +1 -1
  557. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  558. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  559. transformers/models/swiftformer/modeling_swiftformer.py +4 -0
  560. transformers/models/swin/modeling_swin.py +16 -12
  561. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  562. transformers/models/swin2sr/modeling_swin2sr.py +49 -33
  563. transformers/models/swinv2/modeling_swinv2.py +41 -33
  564. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  565. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  566. transformers/models/t5/configuration_t5.py +7 -1
  567. transformers/models/t5/modeling_t5.py +1 -7
  568. transformers/models/t5gemma/modeling_t5gemma.py +1 -1
  569. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  570. transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
  571. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  572. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  573. transformers/models/table_transformer/modeling_table_transformer.py +1 -1
  574. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  575. transformers/models/timesfm/modeling_timesfm.py +12 -0
  576. transformers/models/timesfm/modular_timesfm.py +12 -0
  577. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  578. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  579. transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
  580. transformers/models/trocr/modeling_trocr.py +1 -2
  581. transformers/models/tvp/configuration_tvp.py +5 -1
  582. transformers/models/tvp/modeling_tvp.py +4 -4
  583. transformers/models/udop/configuration_udop.py +1 -0
  584. transformers/models/udop/modeling_udop.py +3 -7
  585. transformers/models/umt5/configuration_umt5.py +2 -2
  586. transformers/models/umt5/modeling_umt5.py +0 -6
  587. transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
  588. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  589. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  590. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  591. transformers/models/video_llava/modeling_video_llava.py +7 -3
  592. transformers/models/vilt/configuration_vilt.py +2 -2
  593. transformers/models/vilt/modeling_vilt.py +7 -0
  594. transformers/models/vipllava/modeling_vipllava.py +7 -3
  595. transformers/models/visual_bert/modeling_visual_bert.py +2 -0
  596. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  597. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  598. transformers/models/vitmatte/modeling_vitmatte.py +4 -0
  599. transformers/models/vitpose/configuration_vitpose.py +1 -1
  600. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  601. transformers/models/voxtral/modeling_voxtral.py +2 -2
  602. transformers/models/voxtral/modular_voxtral.py +2 -2
  603. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
  604. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
  605. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
  606. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  607. transformers/models/whisper/generation_whisper.py +1 -0
  608. transformers/models/whisper/modeling_whisper.py +5 -3
  609. transformers/models/x_clip/modeling_x_clip.py +2 -0
  610. transformers/models/xcodec/modeling_xcodec.py +5 -0
  611. transformers/models/xglm/modeling_xglm.py +10 -0
  612. transformers/models/xlm/modeling_xlm.py +13 -14
  613. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  614. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  615. transformers/models/xlnet/modeling_xlnet.py +3 -1
  616. transformers/models/xmod/modeling_xmod.py +3 -0
  617. transformers/models/yoso/modeling_yoso.py +4 -1
  618. transformers/models/zamba/modeling_zamba.py +2 -1
  619. transformers/models/zamba2/modeling_zamba2.py +3 -2
  620. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  621. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  622. transformers/models/zoedepth/modeling_zoedepth.py +7 -0
  623. transformers/pipelines/__init__.py +9 -6
  624. transformers/pipelines/automatic_speech_recognition.py +20 -12
  625. transformers/pipelines/base.py +1 -1
  626. transformers/pipelines/document_question_answering.py +1 -1
  627. transformers/pipelines/question_answering.py +1 -1
  628. transformers/pipelines/text_to_audio.py +2 -2
  629. transformers/processing_utils.py +127 -56
  630. transformers/quantizers/auto.py +2 -4
  631. transformers/quantizers/base.py +9 -64
  632. transformers/quantizers/quantizer_aqlm.py +1 -18
  633. transformers/quantizers/quantizer_auto_round.py +1 -10
  634. transformers/quantizers/quantizer_awq.py +3 -8
  635. transformers/quantizers/quantizer_bitnet.py +1 -6
  636. transformers/quantizers/quantizer_bnb_4bit.py +9 -49
  637. transformers/quantizers/quantizer_bnb_8bit.py +9 -19
  638. transformers/quantizers/quantizer_compressed_tensors.py +1 -4
  639. transformers/quantizers/quantizer_eetq.py +2 -12
  640. transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
  641. transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
  642. transformers/quantizers/quantizer_fp_quant.py +4 -4
  643. transformers/quantizers/quantizer_gptq.py +1 -4
  644. transformers/quantizers/quantizer_higgs.py +2 -6
  645. transformers/quantizers/quantizer_mxfp4.py +2 -28
  646. transformers/quantizers/quantizer_quanto.py +14 -14
  647. transformers/quantizers/quantizer_spqr.py +3 -8
  648. transformers/quantizers/quantizer_torchao.py +28 -124
  649. transformers/quantizers/quantizer_vptq.py +1 -10
  650. transformers/testing_utils.py +28 -12
  651. transformers/tokenization_mistral_common.py +3 -2
  652. transformers/tokenization_utils_base.py +3 -2
  653. transformers/tokenization_utils_tokenizers.py +25 -2
  654. transformers/trainer.py +24 -2
  655. transformers/trainer_callback.py +8 -0
  656. transformers/trainer_seq2seq.py +4 -0
  657. transformers/training_args.py +8 -10
  658. transformers/utils/__init__.py +4 -0
  659. transformers/utils/attention_visualizer.py +4 -4
  660. transformers/utils/auto_docstring.py +34 -25
  661. transformers/utils/generic.py +20 -0
  662. transformers/utils/import_utils.py +51 -9
  663. transformers/utils/kernel_config.py +71 -18
  664. transformers/utils/quantization_config.py +8 -8
  665. transformers/video_processing_utils.py +16 -12
  666. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
  667. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
  668. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
  669. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  670. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
  671. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py

@@ -25,7 +25,7 @@ from ... import initialization as init
 from ...cache_utils import Cache
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, logging
+from ...utils import TransformersKwargs, is_grouped_mm_available, logging
 from ..hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1RotaryEmbedding
 from ..llama.modeling_llama import (
     LlamaAttention,
@@ -177,7 +177,9 @@ class HunYuanMoEV1DecoderLayer(LlamaDecoderLayer):
 
 
 class HunYuanMoEV1PreTrainedModel(LlamaPreTrainedModel):
-    _can_compile_fullgraph = False
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
 
     @torch.no_grad()
     def _init_weights(self, module):
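Note: the flag above is evaluated once, at class-definition time, so full-graph `torch.compile` support is only advertised when the grouped-matmul kernel is present. A minimal sketch of the pattern, with a hypothetical probe standing in for `transformers.utils.is_grouped_mm_available` (whose implementation is not shown in this diff):

import torch

def is_grouped_mm_available() -> bool:
    # Hypothetical stand-in probe; the real helper's logic is not part of this diff.
    return hasattr(torch, "_grouped_mm")

class PreTrainedStub:
    _can_compile_fullgraph = False  # conservative default

class MoEStub(PreTrainedStub):
    # Evaluated once when the class body executes, not per instance,
    # so every instance shares the same capability flag.
    _can_compile_fullgraph = is_grouped_mm_available()

print(MoEStub._can_compile_fullgraph)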
transformers/models/ibert/modeling_ibert.py

@@ -593,16 +593,32 @@ class IBertPreTrainedModel(PreTrainedModel):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             if module.bias is not None:
                 init.zeros_(module.bias)
+            if getattr(module, "weight_integer", None) is not None:
+                init.zeros_(module.weight_integer)
+                init.zeros_(module.fc_scaling_factor)
+            if getattr(module, "bias_integer", None) is not None:
+                init.zeros_(module.bias_integer)
         elif isinstance(module, (QuantEmbedding, nn.Embedding)):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])
+            if getattr(module, "weight_scaling_factor", None) is not None:
+                init.zeros_(module.weight_scaling_factor)
+                init.zeros_(module.weight_integer)
         elif isinstance(module, (IntLayerNorm, nn.LayerNorm)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "shift", None) is not None:
+                init.zeros_(module.shift)
         elif isinstance(module, IBertLMHead):
             init.zeros_(module.bias)
+        elif isinstance(module, IBertEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+        elif isinstance(module, QuantAct):
+            init.constant_(module.x_min, -1e-5)
+            init.constant_(module.x_max, 1e-5)
+            init.zeros_(module.act_scaling_factor)
 
     def resize_token_embeddings(self, new_num_tokens=None):
         raise NotImplementedError("`resize_token_embeddings` is not supported for I-BERT.")
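Note: each new branch probes for the optional I-BERT quantization buffers with `getattr(..., None)` before writing, since only the quantized module variants carry them. A runnable sketch of the same pattern on an illustrative stand-in module (`QuantLinearStub` is made up; the buffer names mirror the diff):

import torch
import torch.nn as nn

class QuantLinearStub(nn.Linear):
    """Stand-in for a quantized Linear: adds integer-weight buffers."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features)
        self.register_buffer("weight_integer", torch.empty_like(self.weight))
        self.register_buffer("fc_scaling_factor", torch.empty(1))

def init_weights(module, std=0.02):
    if isinstance(module, nn.Linear):
        nn.init.normal_(module.weight, mean=0.0, std=std)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
        # A plain nn.Linear has no quantization buffers, so probe first.
        if getattr(module, "weight_integer", None) is not None:
            module.weight_integer.zero_()
            module.fc_scaling_factor.zero_()

for m in (nn.Linear(4, 4), QuantLinearStub(4, 4)):
    init_weights(m)  # the same code path handles both variants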
transformers/models/idefics/modeling_idefics.py

@@ -840,6 +840,7 @@ class IdeficsPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, IdeficsVisionEmbeddings):
             init.normal_(module.class_embedding)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, IdeficsGatedCrossAttentionLayer):
             if self.config.alpha_initializer == "zeros":
                 init.zeros_(module.alpha_cross_attn)
@@ -852,6 +853,15 @@ class IdeficsPreTrainedModel(PreTrainedModel):
                 init.normal_(module.alpha_dense, mean=0.0, std=self.config.alphas_initializer_range)
         elif isinstance(module, IdeficsPerceiverResampler):
             init.normal_(module.latents)
+        elif isinstance(module, IdeficsEmbedding):
+            inv_freq = 1.0 / (module.base ** (torch.arange(0, module.dim, 2) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+            t = torch.arange(module.max_position_embeddings).type_as(inv_freq)
+            freqs = torch.einsum("i,j->ij", t, inv_freq)
+            # Different from paper, but it uses a different permutation in order to obtain the same calculation
+            emb = torch.cat((freqs, freqs), dim=-1)
+            init.copy_(module.cos_cached, emb.cos())
+            init.copy_(module.sin_cached, emb.sin())
 
 
 @auto_docstring
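Note: the new `IdeficsEmbedding` branch rebuilds the rotary-embedding caches with the standard RoPE recipe. A self-contained rerun of the same math with toy sizes (`dim`, `base`, and the sequence length are made up for illustration):

import torch

dim, base, max_position_embeddings = 8, 10000.0, 16  # toy values

# Inverse frequencies: 1 / base^(2i/dim) over the even channel indices.
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2) / dim))       # (dim/2,)
t = torch.arange(max_position_embeddings).type_as(inv_freq)      # (seq,)
freqs = torch.einsum("i,j->ij", t, inv_freq)                     # (seq, dim/2)
# Duplicated rather than interleaved: a different permutation than the
# paper, but it yields the same rotation (per the comment in the diff).
emb = torch.cat((freqs, freqs), dim=-1)                          # (seq, dim)
cos_cached, sin_cached = emb.cos(), emb.sin()
assert cos_cached.shape == (max_position_embeddings, dim)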
transformers/models/idefics2/modeling_idefics2.py

@@ -452,6 +452,8 @@ class Idefics2VisionTransformer(Idefics2PreTrainedModel):
         self.encoder = Idefics2Encoder(config)
         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
 
+        self.post_init()
+
     def get_input_embeddings(self):
         return self.embeddings
 
@@ -711,6 +713,8 @@ class Idefics2PerceiverResampler(Idefics2PreTrainedModel):
         self.layers = nn.ModuleList([Idefics2PerceiverLayer(config, idx) for idx in range(self.depth)])
         self.norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)
 
+        self.post_init()
+
     @auto_docstring
     def forward(
         self,
@@ -1115,6 +1119,7 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin)
         pixel_attention_mask=None,
         image_hidden_states=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take
@@ -1130,10 +1135,11 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin)
             pixel_attention_mask=pixel_attention_mask,
             image_hidden_states=image_hidden_states,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if image_hidden_states is not None or cache_position[0] != 0:
+        if image_hidden_states is not None or not is_first_iteration:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_attention_mask"] = None
 
transformers/models/idefics3/modeling_idefics3.py

@@ -458,6 +458,8 @@ class Idefics3VisionTransformer(Idefics3PreTrainedModel):
         self.patch_size = config.patch_size
         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
 
+        self.post_init()
+
     # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionTransformer.get_input_embeddings
     def get_input_embeddings(self):
         return self.embeddings
@@ -887,6 +889,7 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin)
         pixel_attention_mask=None,
         image_hidden_states=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take
@@ -902,10 +905,11 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin)
             pixel_attention_mask=pixel_attention_mask,
             image_hidden_states=image_hidden_states,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if image_hidden_states is not None or cache_position[0] != 0:
+        if image_hidden_states is not None or not is_first_iteration:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_attention_mask"] = None
 
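Note: Idefics2 and Idefics3 make the same switch here: rather than inferring the prefill step from `cache_position[0] != 0`, the generation loop now passes an explicit `is_first_iteration` flag, and pixel inputs are dropped on every step after the first. A hedged sketch of the effect (a simplified stand-in, not the real `prepare_inputs_for_generation`):

def prepare_inputs(step_inputs, image_hidden_states=None, is_first_iteration=False):
    model_inputs = dict(step_inputs)
    # Pixels are only consumed on the prefill pass; afterwards the image
    # features are already encoded, so re-sending pixels would be wasted work.
    if image_hidden_states is not None or not is_first_iteration:
        model_inputs["pixel_values"] = None
        model_inputs["pixel_attention_mask"] = None
    return model_inputs

prefill = prepare_inputs({"pixel_values": "px", "pixel_attention_mask": "m"}, is_first_iteration=True)
decode = prepare_inputs({"pixel_values": "px", "pixel_attention_mask": "m"}, is_first_iteration=False)
assert prefill["pixel_values"] == "px" and decode["pixel_values"] is None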
transformers/models/imagegpt/image_processing_imagegpt_fast.py

@@ -164,12 +164,8 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast):
 
             input_ids = reorder_images(input_ids_grouped, grouped_images_index)
 
-            return BatchFeature(
-                data={"input_ids": torch.stack(input_ids, dim=0) if return_tensors else input_ids},
-                tensor_type=return_tensors,
-            )
+            return BatchFeature(data={"input_ids": input_ids}, tensor_type=return_tensors)
 
-        pixel_values = torch.stack(pixel_values, dim=0) if return_tensors else pixel_values
         return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors)
 
     def to_dict(self):
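Note: the manual `torch.stack` appears to be redundant because `BatchFeature` handles tensor conversion itself when `tensor_type` is set (`feature_extraction_utils.py` is updated in this same release; see the file list above). A hedged usage sketch under that assumption:

import torch
from transformers import BatchFeature

images = [torch.zeros(3, 2, 2) for _ in range(4)]  # equal-shaped outputs
feat = BatchFeature(data={"pixel_values": images}, tensor_type="pt")
# Assumed behavior: the "pt" conversion batches the list into a single
# (4, 3, 2, 2) tensor, which is why the explicit stack could be dropped.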
transformers/models/imagegpt/modeling_imagegpt.py

@@ -61,7 +61,7 @@ class ImageGPTLayerNorm(nn.Module):
 class ImageGPTAttention(nn.Module):
     def __init__(self, config, is_cross_attention: Optional[bool] = False, layer_idx: Optional[int] = None):
         super().__init__()
-
+        self.config = config
         max_positions = config.max_position_embeddings
         self.register_buffer(
             "bias",
@@ -70,7 +70,6 @@ class ImageGPTAttention(nn.Module):
             ),
             persistent=False,
         )
-        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
 
         self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
@@ -384,6 +383,14 @@ class ImageGPTPreTrainedModel(PreTrainedModel):
             if "c_proj" in name and "weight" in name:
                 # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                 init.normal_(p, mean=0.0, std=self.config.initializer_range / math.sqrt(2 * self.config.n_layer))
+        elif isinstance(module, ImageGPTAttention):
+            max_positions = module.config.max_position_embeddings
+            init.copy_(
+                module.bias,
+                torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
+                    1, 1, max_positions, max_positions
+                ),
+            )
 
 
 @auto_docstring
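Note: the restored `bias` buffer is the usual causal mask: a lower-triangular boolean matrix viewed as `(1, 1, seq, seq)` so it broadcasts over batch and head dimensions. A toy-sized rerun of the same expression:

import torch

max_positions = 4  # toy size; the model uses config.max_position_embeddings
bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
    1, 1, max_positions, max_positions
)
print(bias[0, 0].int())
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 1]])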
transformers/models/instructblip/modeling_instructblip.py

@@ -335,6 +335,8 @@ class InstructBlipPreTrainedModel(PreTrainedModel):
             init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
         elif isinstance(module, (InstructBlipForConditionalGeneration, InstructBlipModel)):
             init.zeros_(module.query_tokens)
+        elif isinstance(module, InstructBlipQFormerEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
 
 
 # Copied from transformers.models.blip.modeling_blip.BlipEncoder with Blip->InstructBlip
transformers/models/instructblipvideo/modeling_instructblipvideo.py

@@ -128,6 +128,56 @@ class InstructBlipVideoVisionEmbeddings(nn.Module):
         return embeddings
 
 
+class InstructBlipVideoQFormerEmbeddings(nn.Module):
+    """Construct the embeddings from word and position embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+
+        self.config = config
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        query_embeds=None,
+        past_key_values_length=0,
+    ):
+        if input_ids is not None:
+            seq_length = input_ids.size()[1]
+        else:
+            seq_length = 0
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()
+
+        if input_ids is not None:
+            embeddings = self.word_embeddings(input_ids)
+
+            position_embeddings = self.position_embeddings(position_ids.to(embeddings.device))
+            embeddings = embeddings + position_embeddings
+
+            if query_embeds is not None:
+                embeddings = torch.cat((query_embeds, embeddings), dim=1)
+        else:
+            embeddings = query_embeds
+
+        embeddings = embeddings.to(self.layernorm.weight.dtype)
+        embeddings = self.layernorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
 @auto_docstring
 class InstructBlipVideoPreTrainedModel(PreTrainedModel):
     config: InstructBlipVideoConfig
@@ -158,6 +208,8 @@ class InstructBlipVideoPreTrainedModel(PreTrainedModel):
             init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
         elif isinstance(module, (InstructBlipVideoForConditionalGeneration, InstructBlipVideoModel)):
             init.zeros_(module.query_tokens)
+        elif isinstance(module, InstructBlipVideoQFormerEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
 
 
 # Adapted from transformers.models.siglip.modeling_siglip.eager_attention_forward -> InstructBlipVideo doesn't cast attn weights to fp32
@@ -677,56 +729,6 @@ class InstructBlipVideoQFormerEncoder(nn.Module):
         )
 
 
-class InstructBlipVideoQFormerEmbeddings(nn.Module):
-    """Construct the embeddings from word and position embeddings."""
-
-    def __init__(self, config):
-        super().__init__()
-        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
-        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
-
-        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-
-        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer(
-            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
-        )
-
-        self.config = config
-
-    def forward(
-        self,
-        input_ids=None,
-        position_ids=None,
-        query_embeds=None,
-        past_key_values_length=0,
-    ):
-        if input_ids is not None:
-            seq_length = input_ids.size()[1]
-        else:
-            seq_length = 0
-
-        if position_ids is None:
-            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()
-
-        if input_ids is not None:
-            embeddings = self.word_embeddings(input_ids)
-
-            position_embeddings = self.position_embeddings(position_ids.to(embeddings.device))
-            embeddings = embeddings + position_embeddings
-
-            if query_embeds is not None:
-                embeddings = torch.cat((query_embeds, embeddings), dim=1)
-        else:
-            embeddings = query_embeds
-
-        embeddings = embeddings.to(self.layernorm.weight.dtype)
-        embeddings = self.layernorm(embeddings)
-        embeddings = self.dropout(embeddings)
-        return embeddings
-
-
 class InstructBlipVideoQFormerModel(InstructBlipVideoPreTrainedModel):
     """
     Querying Transformer (Q-Former), used in InstructBlipVideo. Slightly modified from BLIP-2 as it also takes the
@@ -84,7 +84,6 @@ class InstructBlipVideoVideoProcessor(BaseVideoProcessor):
             processed_videos_grouped[shape] = stacked_videos

         processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
-        processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos

         return BatchFeature(data={"pixel_values": processed_videos}, tensor_type=return_tensors)
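The `torch.stack` removed above is delegated to `BatchFeature`, which converts its data when `tensor_type` is set (this release also reworks `feature_extraction_utils.py`, per the file list). A hedged sketch of the intended behavior; the exact conversion path is an assumption here:

```python
# Assumed behavior after this change: BatchFeature tensorizes the list of
# same-shaped per-video tensors itself once tensor_type is given, making the
# manual torch.stack redundant.
import torch
from transformers.feature_extraction_utils import BatchFeature

videos = [torch.zeros(3, 4, 4), torch.zeros(3, 4, 4)]  # same shape -> stackable
batch = BatchFeature(data={"pixel_values": videos}, tensor_type="pt")
print(batch["pixel_values"].shape)  # expected: torch.Size([2, 3, 4, 4])
```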
 
@@ -209,10 +209,9 @@ class InternVLVisionPatchEmbeddings(nn.Module):
             )

         embeddings = self.projection(pixel_values.to(self.projection.weight.dtype))
-        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
         embeddings = embeddings.flatten(2).transpose(1, 2)

-        return embeddings, (patch_height, patch_width)
+        return embeddings


 # Based on timm implementation, which can be found here:
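The dropped `(patch_height, patch_width)` return value was unused by the callers updated later in this diff; what remains is the usual conv-to-sequence reshape. A toy sketch of that reshape:

```python
# A 4x4-patch conv over a 16x16 image yields a (1, C, 4, 4) feature map;
# flatten(2).transpose(1, 2) turns it into 16 patch tokens of width C.
import torch
import torch.nn as nn

proj = nn.Conv2d(3, 8, kernel_size=4, stride=4)  # toy patch projection
pixel_values = torch.zeros(1, 3, 16, 16)
embeddings = proj(pixel_values)                  # (1, 8, 4, 4)
embeddings = embeddings.flatten(2).transpose(1, 2)
print(embeddings.shape)                          # torch.Size([1, 16, 8])
```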
@@ -291,7 +290,7 @@ class InternVLVisionEmbeddings(nn.Module):
         bool_masked_pos: Optional[torch.BoolTensor] = None,
     ) -> torch.Tensor:
         _, _, height, width = pixel_values.shape
-        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
+        embeddings = self.patch_embeddings(pixel_values)
         batch_size, seq_len, _ = embeddings.size()

         if bool_masked_pos is not None:
@@ -308,7 +307,7 @@ class InternVLVisionEmbeddings(nn.Module):

         embeddings = self.dropout(embeddings)

-        return embeddings, (patch_height, patch_width)
+        return embeddings


 class InternVLVisionMLP(nn.Module):
@@ -455,7 +454,7 @@ class InternVLVisionModel(InternVLVisionPreTrainedModel):
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
             Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
         """
-        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+        embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

         encoder_outputs = self.encoder(embedding_output)
         sequence_output = encoder_outputs[0]
@@ -898,6 +897,7 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin)
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -909,12 +909,15 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin)
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration, if available.
+            # In subsequent iterations, they are already merged with text and cached.
+            # NOTE: the first iteration doesn't have to be prefill; it can be the first
+            # iteration with a question and a cached system prompt (continuing generation from the cache).
             model_inputs["pixel_values"] = pixel_values

         return model_inputs
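The switch from `cache_position[0] == 0` to `is_first_iteration` matters when generation resumes from a prefilled cache (e.g. a cached system prompt): the cache position is then nonzero, yet image features still need to be injected exactly once. A schematic of the gating logic only; this is not the real `GenerationMixin` plumbing, and the helper name is made up:

```python
# Hypothetical helper mirroring the gating above: pixel values are forwarded
# only on the first iteration (or when caching is off), since after that the
# image features are already merged into the cached sequence.
def prepare_step_inputs(pixel_values, input_ids, is_first_iteration, use_cache=True):
    inputs = {"input_ids": input_ids}
    if is_first_iteration or not use_cache:
        inputs["pixel_values"] = pixel_values
    return inputs

first = prepare_step_inputs("PIXELS", [1, 2, 3], is_first_iteration=True)
later = prepare_step_inputs("PIXELS", [4], is_first_iteration=False)
print("pixel_values" in first, "pixel_values" in later)  # True False
```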
@@ -29,7 +29,7 @@ from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_int
 from ...utils.generic import check_model_inputs
 from ..clip.modeling_clip import CLIPMLP
 from ..janus.modeling_janus import JanusVisionAttention
@@ -44,9 +44,6 @@ from ..llava.modeling_llava import (
 from .configuration_internvl import InternVLConfig, InternVLVisionConfig


-logger = logging.get_logger(__name__)
-
-
 def eager_attention_forward(
     module: nn.Module,
     query: torch.Tensor,
@@ -177,10 +174,9 @@ class InternVLVisionPatchEmbeddings(nn.Module):
             )

         embeddings = self.projection(pixel_values.to(self.projection.weight.dtype))
-        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
         embeddings = embeddings.flatten(2).transpose(1, 2)

-        return embeddings, (patch_height, patch_width)
+        return embeddings


 # Based on timm implementation, which can be found here:
@@ -259,7 +255,7 @@ class InternVLVisionEmbeddings(nn.Module):
         bool_masked_pos: Optional[torch.BoolTensor] = None,
     ) -> torch.Tensor:
         _, _, height, width = pixel_values.shape
-        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
+        embeddings = self.patch_embeddings(pixel_values)
         batch_size, seq_len, _ = embeddings.size()

         if bool_masked_pos is not None:
@@ -276,7 +272,7 @@ class InternVLVisionEmbeddings(nn.Module):

         embeddings = self.dropout(embeddings)

-        return embeddings, (patch_height, patch_width)
+        return embeddings


 class InternVLVisionMLP(CLIPMLP):
@@ -412,7 +408,7 @@ class InternVLVisionModel(InternVLVisionPreTrainedModel):
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
             Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
         """
-        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+        embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

         encoder_outputs = self.encoder(embedding_output)
         sequence_output = encoder_outputs[0]
@@ -140,7 +140,6 @@ class InternVLVideoProcessor(BaseVideoProcessor):
             processed_videos_grouped[shape] = stacked_videos

         processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
-        processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos

         return BatchFeature(data={"pixel_values_videos": processed_videos}, tensor_type=return_tensors)

@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_jais2 import *
+    from .modeling_jais2 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
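The `_LazyModule` indirection above means `configuration_jais2` and `modeling_jais2` are imported only on first attribute access, keeping `import transformers` cheap. A usage sketch, assuming `jais2` is registered with the auto classes in this release:

```python
# Assumes jais2 is in CONFIG_MAPPING for this release; the lazy module defers
# the actual import of configuration_jais2 until this call resolves it.
from transformers import AutoConfig

config = AutoConfig.for_model("jais2")
print(type(config).__name__)  # Jais2Config
```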
@@ -0,0 +1,152 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/jais2/modular_jais2.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_jais2.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from ...configuration_utils import PreTrainedConfig
+from ...modeling_rope_utils import RopeParameters
+
+
+class Jais2Config(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Jais2Model`]. It is used to instantiate a Jais2
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with
+    the defaults will yield a similar configuration to that of
+    [inceptionai/Jais-2-8B-Chat](https://huggingface.co/inceptionai/Jais-2-8B-Chat).
+
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 150272):
+            Vocabulary size of the Jais2 model.
+        hidden_size (`int`, *optional*, defaults to 3328):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 26624):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 26):
+            Number of attention heads for each attention layer.
+        num_key_value_heads (`int`, *optional*):
+            Number of key/value heads for Grouped Query Attention. Defaults to `num_attention_heads` when unset.
+        hidden_act (`str`, *optional*, defaults to `"relu2"`):
+            The non-linear activation function in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 150024):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings.
+        attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use a bias in the query, key, value and output projection layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use a bias in up_proj, down_proj and gate_proj layers.
+        head_dim (`int`, *optional*):
+            The attention head dimension. Falls back to `hidden_size // num_attention_heads` when unset.
+        rope_parameters (`dict`, *optional*):
+            The parameters for the RoPE embeddings (see `RopeParameters`).
+    """
+
+    model_type = "jais2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size: Optional[int] = 150272,
+        hidden_size: Optional[int] = 3328,
+        intermediate_size: Optional[int] = 26624,
+        num_hidden_layers: Optional[int] = 32,
+        num_attention_heads: Optional[int] = 26,
+        num_key_value_heads: Optional[int] = None,
+        hidden_act: Optional[str] = "relu2",
+        max_position_embeddings: Optional[int] = 8192,
+        initializer_range: Optional[float] = 0.02,
+        layer_norm_eps: Optional[float] = 1e-5,
+        use_cache: Optional[bool] = True,
+        pad_token_id: Optional[int] = None,
+        bos_token_id: Optional[int] = 0,
+        eos_token_id: Optional[int] = 150024,
+        tie_word_embeddings: Optional[bool] = False,
+        attention_bias: Optional[bool] = True,
+        attention_dropout: Optional[float] = 0.0,
+        mlp_bias: Optional[bool] = True,
+        head_dim: Optional[int] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
+        self.rope_parameters = rope_parameters
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.layer_norm_eps = layer_norm_eps
+
+
+__all__ = ["Jais2Config"]
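For completeness, a usage sketch in the style of the standard configuration-docstring examples; the top-level import assumes `Jais2Config` is re-exported as the `__all__` above suggests:

```python
from transformers import Jais2Config

# Initializing a configuration with the defaults documented above
configuration = Jais2Config()

# head_dim falls back to hidden_size // num_attention_heads when unset
print(configuration.hidden_size, configuration.num_attention_heads, configuration.head_dim)
# 3328 26 128
```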