transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (835)
  1. transformers/__init__.py +49 -3
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/cli/serve.py +47 -17
  6. transformers/configuration_utils.py +114 -70
  7. transformers/conversion_mapping.py +83 -7
  8. transformers/convert_slow_tokenizer.py +225 -10
  9. transformers/core_model_loading.py +374 -147
  10. transformers/data/data_collator.py +12 -4
  11. transformers/dependency_versions_table.py +2 -3
  12. transformers/dynamic_module_utils.py +1 -2
  13. transformers/feature_extraction_utils.py +55 -24
  14. transformers/file_utils.py +0 -1
  15. transformers/generation/__init__.py +11 -1
  16. transformers/generation/candidate_generator.py +79 -31
  17. transformers/generation/configuration_utils.py +165 -124
  18. transformers/generation/continuous_batching/__init__.py +4 -0
  19. transformers/generation/continuous_batching/cache.py +47 -18
  20. transformers/generation/continuous_batching/cache_manager.py +131 -34
  21. transformers/generation/continuous_batching/continuous_api.py +228 -136
  22. transformers/generation/continuous_batching/requests.py +28 -1
  23. transformers/generation/continuous_batching/scheduler.py +11 -4
  24. transformers/generation/stopping_criteria.py +1 -1
  25. transformers/generation/utils.py +108 -110
  26. transformers/generation/watermarking.py +8 -5
  27. transformers/image_processing_base.py +3 -14
  28. transformers/image_processing_utils_fast.py +15 -4
  29. transformers/initialization.py +37 -0
  30. transformers/integrations/__init__.py +16 -2
  31. transformers/integrations/accelerate.py +58 -113
  32. transformers/integrations/aqlm.py +36 -66
  33. transformers/integrations/awq.py +46 -515
  34. transformers/integrations/bitnet.py +47 -105
  35. transformers/integrations/bitsandbytes.py +91 -202
  36. transformers/integrations/deepspeed.py +18 -2
  37. transformers/integrations/eetq.py +84 -81
  38. transformers/integrations/fbgemm_fp8.py +191 -145
  39. transformers/integrations/finegrained_fp8.py +241 -208
  40. transformers/integrations/flash_attention.py +2 -2
  41. transformers/integrations/fp_quant.py +92 -0
  42. transformers/integrations/ggml.py +11 -1
  43. transformers/integrations/higgs.py +37 -62
  44. transformers/integrations/hub_kernels.py +65 -8
  45. transformers/integrations/integration_utils.py +45 -0
  46. transformers/integrations/mistral.py +12 -0
  47. transformers/integrations/moe.py +240 -0
  48. transformers/integrations/mxfp4.py +28 -74
  49. transformers/integrations/peft.py +12 -29
  50. transformers/integrations/quanto.py +77 -56
  51. transformers/integrations/quark.py +55 -0
  52. transformers/integrations/spqr.py +42 -90
  53. transformers/integrations/tensor_parallel.py +167 -221
  54. transformers/integrations/torchao.py +32 -38
  55. transformers/integrations/vptq.py +40 -59
  56. transformers/modelcard.py +1 -2
  57. transformers/modeling_gguf_pytorch_utils.py +74 -19
  58. transformers/modeling_rope_utils.py +107 -86
  59. transformers/modeling_utils.py +611 -527
  60. transformers/models/__init__.py +22 -0
  61. transformers/models/afmoe/modeling_afmoe.py +10 -19
  62. transformers/models/afmoe/modular_afmoe.py +5 -13
  63. transformers/models/aimv2/modeling_aimv2.py +4 -0
  64. transformers/models/aimv2/modular_aimv2.py +4 -0
  65. transformers/models/albert/modeling_albert.py +3 -0
  66. transformers/models/albert/tokenization_albert.py +6 -12
  67. transformers/models/align/modeling_align.py +14 -6
  68. transformers/models/altclip/modeling_altclip.py +11 -3
  69. transformers/models/apertus/modeling_apertus.py +8 -6
  70. transformers/models/apertus/modular_apertus.py +4 -1
  71. transformers/models/arcee/modeling_arcee.py +5 -5
  72. transformers/models/aria/modeling_aria.py +12 -8
  73. transformers/models/aria/modular_aria.py +7 -3
  74. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  75. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  76. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  77. transformers/models/auto/auto_factory.py +1 -1
  78. transformers/models/auto/configuration_auto.py +38 -0
  79. transformers/models/auto/feature_extraction_auto.py +9 -3
  80. transformers/models/auto/image_processing_auto.py +5 -2
  81. transformers/models/auto/modeling_auto.py +37 -0
  82. transformers/models/auto/processing_auto.py +22 -10
  83. transformers/models/auto/tokenization_auto.py +147 -566
  84. transformers/models/auto/video_processing_auto.py +5 -2
  85. transformers/models/autoformer/modeling_autoformer.py +4 -0
  86. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  87. transformers/models/bamba/modeling_bamba.py +21 -21
  88. transformers/models/bamba/modular_bamba.py +17 -16
  89. transformers/models/bark/modeling_bark.py +11 -0
  90. transformers/models/bart/configuration_bart.py +0 -1
  91. transformers/models/bart/modeling_bart.py +14 -0
  92. transformers/models/barthez/tokenization_barthez.py +5 -10
  93. transformers/models/beit/image_processing_beit_fast.py +0 -1
  94. transformers/models/beit/modeling_beit.py +6 -1
  95. transformers/models/bert/modeling_bert.py +3 -0
  96. transformers/models/bert/tokenization_bert.py +8 -21
  97. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  98. transformers/models/big_bird/modeling_big_bird.py +9 -0
  99. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  100. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
  101. transformers/models/biogpt/modeling_biogpt.py +2 -0
  102. transformers/models/biogpt/modular_biogpt.py +2 -0
  103. transformers/models/bit/modeling_bit.py +16 -3
  104. transformers/models/bitnet/modeling_bitnet.py +5 -5
  105. transformers/models/blenderbot/modeling_blenderbot.py +12 -0
  106. transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
  107. transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
  108. transformers/models/blip/modeling_blip.py +2 -0
  109. transformers/models/blip/modeling_blip_text.py +10 -0
  110. transformers/models/blip_2/modeling_blip_2.py +4 -1
  111. transformers/models/bloom/modeling_bloom.py +17 -44
  112. transformers/models/blt/modeling_blt.py +164 -4
  113. transformers/models/blt/modular_blt.py +170 -5
  114. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  115. transformers/models/bridgetower/modeling_bridgetower.py +11 -1
  116. transformers/models/bros/modeling_bros.py +12 -0
  117. transformers/models/camembert/modeling_camembert.py +109 -106
  118. transformers/models/camembert/tokenization_camembert.py +8 -12
  119. transformers/models/canine/modeling_canine.py +11 -0
  120. transformers/models/canine/tokenization_canine.py +2 -0
  121. transformers/models/chameleon/modeling_chameleon.py +11 -5
  122. transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
  123. transformers/models/clap/feature_extraction_clap.py +2 -2
  124. transformers/models/clap/modeling_clap.py +30 -15
  125. transformers/models/clip/modeling_clip.py +2 -0
  126. transformers/models/clip/tokenization_clip.py +22 -44
  127. transformers/models/clipseg/modeling_clipseg.py +9 -0
  128. transformers/models/clvp/modeling_clvp.py +19 -3
  129. transformers/models/clvp/tokenization_clvp.py +1 -63
  130. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  131. transformers/models/codegen/modeling_codegen.py +13 -4
  132. transformers/models/codegen/tokenization_codegen.py +14 -43
  133. transformers/models/cohere/modeling_cohere.py +5 -4
  134. transformers/models/cohere/modular_cohere.py +2 -1
  135. transformers/models/cohere/tokenization_cohere.py +12 -42
  136. transformers/models/cohere2/modeling_cohere2.py +8 -7
  137. transformers/models/cohere2/modular_cohere2.py +5 -5
  138. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
  139. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  140. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  141. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  142. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  143. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  144. transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
  145. transformers/models/convbert/modeling_convbert.py +9 -0
  146. transformers/models/convnext/image_processing_convnext.py +2 -2
  147. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  148. transformers/models/convnext/modeling_convnext.py +2 -4
  149. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  150. transformers/models/csm/generation_csm.py +19 -22
  151. transformers/models/csm/modeling_csm.py +7 -4
  152. transformers/models/csm/modular_csm.py +2 -0
  153. transformers/models/ctrl/modeling_ctrl.py +15 -2
  154. transformers/models/cvt/modeling_cvt.py +7 -1
  155. transformers/models/cwm/modeling_cwm.py +5 -5
  156. transformers/models/d_fine/configuration_d_fine.py +3 -4
  157. transformers/models/d_fine/modeling_d_fine.py +48 -39
  158. transformers/models/d_fine/modular_d_fine.py +16 -4
  159. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  160. transformers/models/dab_detr/modeling_dab_detr.py +5 -1
  161. transformers/models/dac/modeling_dac.py +6 -6
  162. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  163. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  164. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  165. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  166. transformers/models/dbrx/configuration_dbrx.py +9 -1
  167. transformers/models/dbrx/modeling_dbrx.py +3 -3
  168. transformers/models/deberta/modeling_deberta.py +7 -0
  169. transformers/models/deberta/tokenization_deberta.py +11 -20
  170. transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
  171. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  172. transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
  173. transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
  174. transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
  175. transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
  176. transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
  177. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  178. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  179. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  180. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  181. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  182. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  183. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  184. transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
  185. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  186. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  187. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  188. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  189. transformers/models/detr/configuration_detr.py +1 -1
  190. transformers/models/detr/modeling_detr.py +13 -1
  191. transformers/models/dia/generation_dia.py +3 -10
  192. transformers/models/dia/modeling_dia.py +16 -4
  193. transformers/models/dia/modular_dia.py +11 -1
  194. transformers/models/dia/processing_dia.py +1 -1
  195. transformers/models/diffllama/modeling_diffllama.py +5 -5
  196. transformers/models/diffllama/modular_diffllama.py +2 -2
  197. transformers/models/dinat/modeling_dinat.py +3 -0
  198. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  199. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  200. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
  201. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
  202. transformers/models/distilbert/modeling_distilbert.py +11 -9
  203. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  204. transformers/models/doge/modeling_doge.py +3 -4
  205. transformers/models/doge/modular_doge.py +0 -1
  206. transformers/models/donut/image_processing_donut_fast.py +0 -1
  207. transformers/models/donut/modeling_donut_swin.py +18 -12
  208. transformers/models/dots1/modeling_dots1.py +23 -11
  209. transformers/models/dots1/modular_dots1.py +5 -3
  210. transformers/models/dpr/modeling_dpr.py +5 -0
  211. transformers/models/dpr/tokenization_dpr.py +12 -0
  212. transformers/models/dpt/configuration_dpt.py +1 -1
  213. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  214. transformers/models/dpt/modular_dpt.py +1 -2
  215. transformers/models/edgetam/configuration_edgetam.py +1 -1
  216. transformers/models/edgetam/modeling_edgetam.py +6 -3
  217. transformers/models/edgetam/modular_edgetam.py +15 -14
  218. transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
  219. transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
  220. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  221. transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
  222. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  223. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  224. transformers/models/efficientnet/modeling_efficientnet.py +7 -1
  225. transformers/models/electra/modeling_electra.py +7 -0
  226. transformers/models/emu3/modeling_emu3.py +12 -6
  227. transformers/models/emu3/modular_emu3.py +7 -1
  228. transformers/models/encodec/modeling_encodec.py +14 -0
  229. transformers/models/eomt/image_processing_eomt.py +13 -1
  230. transformers/models/eomt/image_processing_eomt_fast.py +60 -16
  231. transformers/models/eomt/modeling_eomt.py +7 -0
  232. transformers/models/eomt/modular_eomt.py +7 -0
  233. transformers/models/ernie/modeling_ernie.py +6 -0
  234. transformers/models/ernie/modular_ernie.py +6 -0
  235. transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
  236. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  237. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
  238. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
  239. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  240. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  241. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  242. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  243. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  244. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  245. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  246. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  247. transformers/models/esm/modeling_esm.py +6 -0
  248. transformers/models/esm/modeling_esmfold.py +11 -5
  249. transformers/models/evolla/modeling_evolla.py +13 -5
  250. transformers/models/evolla/modular_evolla.py +8 -0
  251. transformers/models/exaone4/modeling_exaone4.py +3 -3
  252. transformers/models/exaone4/modular_exaone4.py +0 -1
  253. transformers/models/falcon/modeling_falcon.py +9 -4
  254. transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
  255. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  256. transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
  257. transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
  258. transformers/models/fast_vlm/__init__.py +27 -0
  259. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  260. transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
  261. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  262. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
  263. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  264. transformers/models/flaubert/modeling_flaubert.py +21 -15
  265. transformers/models/flava/image_processing_flava_fast.py +0 -2
  266. transformers/models/flava/modeling_flava.py +10 -2
  267. transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
  268. transformers/models/florence2/modeling_florence2.py +22 -4
  269. transformers/models/florence2/modular_florence2.py +15 -1
  270. transformers/models/fnet/modeling_fnet.py +14 -0
  271. transformers/models/focalnet/modeling_focalnet.py +4 -0
  272. transformers/models/fsmt/modeling_fsmt.py +2 -0
  273. transformers/models/funnel/modeling_funnel.py +8 -0
  274. transformers/models/funnel/tokenization_funnel.py +17 -24
  275. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  276. transformers/models/fuyu/modeling_fuyu.py +3 -1
  277. transformers/models/fuyu/processing_fuyu.py +19 -3
  278. transformers/models/gemma/modeling_gemma.py +14 -16
  279. transformers/models/gemma/modular_gemma.py +9 -11
  280. transformers/models/gemma/tokenization_gemma.py +10 -27
  281. transformers/models/gemma2/modeling_gemma2.py +5 -5
  282. transformers/models/gemma2/modular_gemma2.py +3 -2
  283. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  284. transformers/models/gemma3/modeling_gemma3.py +42 -91
  285. transformers/models/gemma3/modular_gemma3.py +38 -87
  286. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  287. transformers/models/gemma3n/modeling_gemma3n.py +65 -218
  288. transformers/models/gemma3n/modular_gemma3n.py +68 -68
  289. transformers/models/git/modeling_git.py +183 -126
  290. transformers/models/glm/modeling_glm.py +5 -5
  291. transformers/models/glm4/modeling_glm4.py +5 -5
  292. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  293. transformers/models/glm46v/modeling_glm46v.py +3 -1
  294. transformers/models/glm46v/modular_glm46v.py +3 -0
  295. transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
  296. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  297. transformers/models/glm4v/configuration_glm4v.py +3 -1
  298. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  299. transformers/models/glm4v/modeling_glm4v.py +18 -8
  300. transformers/models/glm4v/modular_glm4v.py +17 -7
  301. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  302. transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
  303. transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
  304. transformers/models/glmasr/__init__.py +30 -0
  305. transformers/models/glmasr/configuration_glmasr.py +197 -0
  306. transformers/models/glmasr/modeling_glmasr.py +512 -0
  307. transformers/models/glmasr/modular_glmasr.py +433 -0
  308. transformers/models/glmasr/processing_glmasr.py +332 -0
  309. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  310. transformers/models/glpn/modeling_glpn.py +2 -0
  311. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  312. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  313. transformers/models/gpt2/modeling_gpt2.py +13 -6
  314. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  315. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
  316. transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
  317. transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
  318. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  319. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  320. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
  321. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  322. transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
  323. transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
  324. transformers/models/gptj/modeling_gptj.py +18 -6
  325. transformers/models/granite/modeling_granite.py +5 -5
  326. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  327. transformers/models/granitemoe/modeling_granitemoe.py +6 -9
  328. transformers/models/granitemoe/modular_granitemoe.py +1 -4
  329. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  330. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
  331. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  332. transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
  333. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  334. transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
  335. transformers/models/groupvit/modeling_groupvit.py +9 -1
  336. transformers/models/helium/modeling_helium.py +5 -4
  337. transformers/models/herbert/tokenization_herbert.py +9 -25
  338. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
  339. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
  340. transformers/models/hiera/modeling_hiera.py +4 -0
  341. transformers/models/hubert/modeling_hubert.py +7 -0
  342. transformers/models/hubert/modular_hubert.py +5 -0
  343. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
  344. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  345. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  346. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
  347. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  348. transformers/models/ibert/modeling_ibert.py +22 -0
  349. transformers/models/idefics/modeling_idefics.py +15 -21
  350. transformers/models/idefics2/modeling_idefics2.py +7 -1
  351. transformers/models/idefics3/modeling_idefics3.py +5 -1
  352. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  353. transformers/models/imagegpt/modeling_imagegpt.py +11 -3
  354. transformers/models/informer/modeling_informer.py +4 -0
  355. transformers/models/informer/modular_informer.py +1 -0
  356. transformers/models/instructblip/modeling_instructblip.py +2 -0
  357. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  358. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  359. transformers/models/internvl/modeling_internvl.py +13 -12
  360. transformers/models/internvl/modular_internvl.py +7 -13
  361. transformers/models/internvl/video_processing_internvl.py +0 -1
  362. transformers/models/jais2/__init__.py +27 -0
  363. transformers/models/jais2/configuration_jais2.py +152 -0
  364. transformers/models/jais2/modeling_jais2.py +486 -0
  365. transformers/models/jais2/modular_jais2.py +196 -0
  366. transformers/models/jamba/modeling_jamba.py +25 -20
  367. transformers/models/jamba/modular_jamba.py +17 -17
  368. transformers/models/janus/image_processing_janus_fast.py +0 -1
  369. transformers/models/janus/modeling_janus.py +16 -7
  370. transformers/models/janus/modular_janus.py +17 -7
  371. transformers/models/jetmoe/modeling_jetmoe.py +4 -4
  372. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  373. transformers/models/kosmos2/modeling_kosmos2.py +15 -2
  374. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  375. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  376. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
  377. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  378. transformers/models/lasr/__init__.py +29 -0
  379. transformers/models/lasr/configuration_lasr.py +248 -0
  380. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  381. transformers/models/lasr/modeling_lasr.py +730 -0
  382. transformers/models/lasr/modular_lasr.py +576 -0
  383. transformers/models/lasr/processing_lasr.py +94 -0
  384. transformers/models/lasr/tokenization_lasr.py +186 -0
  385. transformers/models/layoutlm/modeling_layoutlm.py +10 -3
  386. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  387. transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
  388. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
  389. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  390. transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
  391. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  392. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  393. transformers/models/led/modeling_led.py +12 -0
  394. transformers/models/levit/modeling_levit.py +21 -0
  395. transformers/models/lfm2/modeling_lfm2.py +5 -6
  396. transformers/models/lfm2/modular_lfm2.py +0 -1
  397. transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
  398. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  399. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  400. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  401. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  402. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  403. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  404. transformers/models/lightglue/modeling_lightglue.py +3 -1
  405. transformers/models/lightglue/modular_lightglue.py +1 -0
  406. transformers/models/lilt/modeling_lilt.py +23 -15
  407. transformers/models/llama/modeling_llama.py +5 -5
  408. transformers/models/llama/tokenization_llama.py +15 -43
  409. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  410. transformers/models/llama4/modeling_llama4.py +11 -6
  411. transformers/models/llava/image_processing_llava_fast.py +0 -1
  412. transformers/models/llava/modeling_llava.py +12 -7
  413. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  414. transformers/models/llava_next/modeling_llava_next.py +7 -3
  415. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  416. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  417. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  418. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  419. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  420. transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
  421. transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
  422. transformers/models/longformer/modeling_longformer.py +6 -0
  423. transformers/models/longt5/modeling_longt5.py +4 -4
  424. transformers/models/luke/modeling_luke.py +9 -0
  425. transformers/models/luke/tokenization_luke.py +11 -38
  426. transformers/models/lxmert/modeling_lxmert.py +2 -0
  427. transformers/models/m2m_100/modeling_m2m_100.py +14 -0
  428. transformers/models/mamba/modeling_mamba.py +16 -23
  429. transformers/models/mamba2/modeling_mamba2.py +24 -23
  430. transformers/models/marian/configuration_marian.py +1 -1
  431. transformers/models/marian/modeling_marian.py +8 -0
  432. transformers/models/markuplm/modeling_markuplm.py +9 -8
  433. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  434. transformers/models/mask2former/configuration_mask2former.py +3 -3
  435. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  436. transformers/models/mask2former/modeling_mask2former.py +11 -0
  437. transformers/models/maskformer/configuration_maskformer.py +3 -3
  438. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  439. transformers/models/maskformer/modeling_maskformer.py +11 -1
  440. transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
  441. transformers/models/mbart/configuration_mbart.py +1 -0
  442. transformers/models/mbart/modeling_mbart.py +14 -0
  443. transformers/models/mbart/tokenization_mbart.py +11 -52
  444. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  445. transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
  446. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  447. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  448. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  449. transformers/models/mimi/modeling_mimi.py +28 -5
  450. transformers/models/minimax/modeling_minimax.py +19 -6
  451. transformers/models/minimax/modular_minimax.py +12 -1
  452. transformers/models/ministral/modeling_ministral.py +5 -5
  453. transformers/models/ministral3/configuration_ministral3.py +1 -1
  454. transformers/models/ministral3/modeling_ministral3.py +5 -4
  455. transformers/models/mistral/modeling_mistral.py +5 -4
  456. transformers/models/mistral3/modeling_mistral3.py +10 -4
  457. transformers/models/mistral3/modular_mistral3.py +3 -1
  458. transformers/models/mixtral/modeling_mixtral.py +15 -7
  459. transformers/models/mixtral/modular_mixtral.py +6 -2
  460. transformers/models/mlcd/modeling_mlcd.py +6 -0
  461. transformers/models/mlcd/modular_mlcd.py +4 -0
  462. transformers/models/mllama/modeling_mllama.py +15 -4
  463. transformers/models/mluke/tokenization_mluke.py +6 -6
  464. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  465. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
  466. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  467. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  468. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  469. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  470. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  471. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  472. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  473. transformers/models/mobilevit/modeling_mobilevit.py +7 -0
  474. transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
  475. transformers/models/modernbert/modeling_modernbert.py +16 -2
  476. transformers/models/modernbert/modular_modernbert.py +14 -1
  477. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
  478. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
  479. transformers/models/moonshine/modeling_moonshine.py +5 -3
  480. transformers/models/moshi/modeling_moshi.py +26 -53
  481. transformers/models/mpnet/modeling_mpnet.py +7 -0
  482. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  483. transformers/models/mpt/modeling_mpt.py +2 -0
  484. transformers/models/mra/modeling_mra.py +10 -1
  485. transformers/models/mt5/configuration_mt5.py +2 -3
  486. transformers/models/mt5/modeling_mt5.py +7 -10
  487. transformers/models/musicgen/modeling_musicgen.py +7 -9
  488. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
  489. transformers/models/mvp/modeling_mvp.py +14 -0
  490. transformers/models/nanochat/modeling_nanochat.py +5 -5
  491. transformers/models/nemotron/modeling_nemotron.py +7 -5
  492. transformers/models/nllb/tokenization_nllb.py +8 -22
  493. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  494. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  495. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  496. transformers/models/nougat/tokenization_nougat.py +15 -68
  497. transformers/models/nystromformer/modeling_nystromformer.py +13 -0
  498. transformers/models/olmo/modeling_olmo.py +5 -5
  499. transformers/models/olmo/modular_olmo.py +2 -2
  500. transformers/models/olmo2/modeling_olmo2.py +5 -6
  501. transformers/models/olmo2/modular_olmo2.py +0 -1
  502. transformers/models/olmo3/modeling_olmo3.py +5 -5
  503. transformers/models/olmoe/modeling_olmoe.py +15 -7
  504. transformers/models/olmoe/modular_olmoe.py +4 -2
  505. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  506. transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
  507. transformers/models/oneformer/configuration_oneformer.py +3 -3
  508. transformers/models/oneformer/modeling_oneformer.py +11 -39
  509. transformers/models/openai/modeling_openai.py +15 -0
  510. transformers/models/openai/tokenization_openai.py +10 -46
  511. transformers/models/opt/modeling_opt.py +2 -0
  512. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  513. transformers/models/ovis2/modeling_ovis2.py +15 -3
  514. transformers/models/ovis2/modular_ovis2.py +8 -0
  515. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  516. transformers/models/owlv2/modeling_owlv2.py +11 -3
  517. transformers/models/owlv2/modular_owlv2.py +0 -2
  518. transformers/models/owlvit/modeling_owlvit.py +11 -3
  519. transformers/models/paddleocr_vl/__init__.py +32 -0
  520. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  521. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
  522. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  523. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
  524. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
  525. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  526. transformers/models/paligemma/modeling_paligemma.py +25 -17
  527. transformers/models/parakeet/configuration_parakeet.py +4 -6
  528. transformers/models/parakeet/modeling_parakeet.py +14 -6
  529. transformers/models/parakeet/modular_parakeet.py +7 -2
  530. transformers/models/parakeet/processing_parakeet.py +1 -0
  531. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  532. transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
  533. transformers/models/patchtst/modeling_patchtst.py +25 -6
  534. transformers/models/pe_audio/__init__.py +30 -0
  535. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  536. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  537. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  538. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  539. transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
  540. transformers/models/pe_audio_video/__init__.py +29 -0
  541. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  542. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  543. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  544. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  545. transformers/models/pe_video/__init__.py +30 -0
  546. transformers/models/pe_video/configuration_pe_video.py +211 -0
  547. transformers/models/pe_video/modeling_pe_video.py +636 -0
  548. transformers/models/pe_video/modular_pe_video.py +219 -0
  549. transformers/models/pe_video/processing_pe_video.py +10 -0
  550. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  551. transformers/models/pegasus/configuration_pegasus.py +1 -0
  552. transformers/models/pegasus/modeling_pegasus.py +8 -0
  553. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  554. transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
  555. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  556. transformers/models/perceiver/modeling_perceiver.py +13 -1
  557. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  558. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  559. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  560. transformers/models/persimmon/modeling_persimmon.py +3 -2
  561. transformers/models/phi/modeling_phi.py +5 -6
  562. transformers/models/phi/modular_phi.py +0 -1
  563. transformers/models/phi3/modeling_phi3.py +3 -2
  564. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
  565. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
  566. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  567. transformers/models/phimoe/modeling_phimoe.py +15 -7
  568. transformers/models/phimoe/modular_phimoe.py +3 -3
  569. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  570. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  571. transformers/models/pixio/__init__.py +30 -0
  572. transformers/models/pixio/configuration_pixio.py +151 -0
  573. transformers/models/pixio/modeling_pixio.py +507 -0
  574. transformers/models/pixio/modular_pixio.py +404 -0
  575. transformers/models/pixtral/modeling_pixtral.py +3 -2
  576. transformers/models/pixtral/processing_pixtral.py +3 -1
  577. transformers/models/plbart/configuration_plbart.py +1 -0
  578. transformers/models/plbart/modeling_plbart.py +13 -0
  579. transformers/models/plbart/modular_plbart.py +8 -0
  580. transformers/models/plbart/tokenization_plbart.py +0 -2
  581. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  582. transformers/models/poolformer/modeling_poolformer.py +13 -1
  583. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  584. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  585. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  586. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  587. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  588. transformers/models/prophetnet/modeling_prophetnet.py +5 -1
  589. transformers/models/pvt/modeling_pvt.py +2 -0
  590. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  591. transformers/models/qwen2/modeling_qwen2.py +5 -5
  592. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  593. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  594. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
  595. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
  596. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  597. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
  598. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
  599. transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
  600. transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
  601. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  602. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  603. transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
  604. transformers/models/qwen3/modeling_qwen3.py +5 -5
  605. transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
  606. transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
  607. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  608. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
  609. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
  610. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  611. transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
  612. transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
  613. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  614. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
  615. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
  616. transformers/models/rag/configuration_rag.py +0 -8
  617. transformers/models/rag/modeling_rag.py +8 -9
  618. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
  619. transformers/models/reformer/modeling_reformer.py +13 -1
  620. transformers/models/reformer/tokenization_reformer.py +11 -28
  621. transformers/models/regnet/modeling_regnet.py +10 -1
  622. transformers/models/rembert/modeling_rembert.py +13 -1
  623. transformers/models/rembert/tokenization_rembert.py +3 -10
  624. transformers/models/resnet/modeling_resnet.py +19 -5
  625. transformers/models/roberta/modeling_roberta.py +3 -0
  626. transformers/models/roberta/modular_roberta.py +3 -0
  627. transformers/models/roberta/tokenization_roberta.py +18 -27
  628. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  629. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  630. transformers/models/roformer/modeling_roformer.py +6 -0
  631. transformers/models/roformer/tokenization_roformer.py +77 -412
  632. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  633. transformers/models/rt_detr/modeling_rt_detr.py +6 -0
  634. transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
  635. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  636. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
  637. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  638. transformers/models/rwkv/modeling_rwkv.py +2 -1
  639. transformers/models/sam/configuration_sam.py +1 -0
  640. transformers/models/sam/image_processing_sam_fast.py +0 -1
  641. transformers/models/sam/modeling_sam.py +4 -1
  642. transformers/models/sam2/configuration_sam2.py +1 -1
  643. transformers/models/sam2/modeling_sam2.py +7 -3
  644. transformers/models/sam2/modular_sam2.py +7 -3
  645. transformers/models/sam2_video/modeling_sam2_video.py +52 -43
  646. transformers/models/sam2_video/modular_sam2_video.py +32 -18
  647. transformers/models/sam3/configuration_sam3.py +21 -1
  648. transformers/models/sam3/modeling_sam3.py +100 -80
  649. transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
  650. transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
  651. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  652. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
  653. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  654. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  655. transformers/models/sam3_video/modeling_sam3_video.py +4 -3
  656. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  657. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  658. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  659. transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
  660. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  661. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
  662. transformers/models/seed_oss/modeling_seed_oss.py +3 -3
  663. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  664. transformers/models/segformer/modeling_segformer.py +6 -3
  665. transformers/models/segformer/modular_segformer.py +0 -1
  666. transformers/models/seggpt/modeling_seggpt.py +2 -0
  667. transformers/models/sew/modeling_sew.py +3 -0
  668. transformers/models/sew/modular_sew.py +1 -0
  669. transformers/models/sew_d/modeling_sew_d.py +3 -0
  670. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  671. transformers/models/siglip/modeling_siglip.py +24 -2
  672. transformers/models/siglip2/modeling_siglip2.py +67 -41
  673. transformers/models/siglip2/modular_siglip2.py +4 -0
  674. transformers/models/smollm3/modeling_smollm3.py +5 -5
  675. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  676. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  677. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  678. transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
  679. transformers/models/speecht5/modeling_speecht5.py +41 -1
  680. transformers/models/splinter/modeling_splinter.py +12 -3
  681. transformers/models/splinter/tokenization_splinter.py +9 -28
  682. transformers/models/squeezebert/modeling_squeezebert.py +8 -0
  683. transformers/models/stablelm/modeling_stablelm.py +4 -2
  684. transformers/models/starcoder2/modeling_starcoder2.py +5 -4
  685. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  686. transformers/models/superglue/modeling_superglue.py +1 -0
  687. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  688. transformers/models/superpoint/modeling_superpoint.py +1 -0
  689. transformers/models/swiftformer/modeling_swiftformer.py +6 -0
  690. transformers/models/swin/modeling_swin.py +20 -12
  691. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  692. transformers/models/swin2sr/modeling_swin2sr.py +51 -33
  693. transformers/models/swinv2/modeling_swinv2.py +45 -33
  694. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  695. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  696. transformers/models/t5/configuration_t5.py +7 -1
  697. transformers/models/t5/modeling_t5.py +8 -7
  698. transformers/models/t5/tokenization_t5.py +4 -8
  699. transformers/models/t5gemma/modeling_t5gemma.py +6 -6
  700. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  701. transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
  702. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  703. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  704. transformers/models/table_transformer/modeling_table_transformer.py +5 -1
  705. transformers/models/tapas/modeling_tapas.py +3 -0
  706. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  707. transformers/models/textnet/modeling_textnet.py +11 -2
  708. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  709. transformers/models/timesfm/modeling_timesfm.py +14 -0
  710. transformers/models/timesfm/modular_timesfm.py +14 -0
  711. transformers/models/timesformer/modeling_timesformer.py +2 -0
  712. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  713. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  714. transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
  715. transformers/models/trocr/modeling_trocr.py +3 -2
  716. transformers/models/tvp/configuration_tvp.py +5 -1
  717. transformers/models/tvp/modeling_tvp.py +6 -4
  718. transformers/models/udop/configuration_udop.py +1 -0
  719. transformers/models/udop/modeling_udop.py +7 -7
  720. transformers/models/udop/tokenization_udop.py +5 -13
  721. transformers/models/umt5/configuration_umt5.py +2 -2
  722. transformers/models/umt5/modeling_umt5.py +7 -6
  723. transformers/models/unispeech/modeling_unispeech.py +4 -0
  724. transformers/models/unispeech/modular_unispeech.py +2 -0
  725. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  726. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  727. transformers/models/univnet/modeling_univnet.py +1 -0
  728. transformers/models/upernet/modeling_upernet.py +1 -0
  729. transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
  730. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  731. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  732. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  733. transformers/models/video_llava/modeling_video_llava.py +7 -3
  734. transformers/models/vilt/configuration_vilt.py +2 -2
  735. transformers/models/vilt/modeling_vilt.py +13 -0
  736. transformers/models/vipllava/modeling_vipllava.py +7 -3
  737. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  738. transformers/models/visual_bert/modeling_visual_bert.py +8 -0
  739. transformers/models/vitdet/modeling_vitdet.py +2 -0
  740. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  741. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  742. transformers/models/vitmatte/modeling_vitmatte.py +5 -0
  743. transformers/models/vitpose/configuration_vitpose.py +1 -1
  744. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  745. transformers/models/vits/modeling_vits.py +1 -0
  746. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  747. transformers/models/voxtral/modeling_voxtral.py +2 -2
  748. transformers/models/voxtral/modular_voxtral.py +2 -2
  749. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  750. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
  751. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
  752. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
  753. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  754. transformers/models/wavlm/modeling_wavlm.py +5 -0
  755. transformers/models/whisper/generation_whisper.py +1 -0
  756. transformers/models/whisper/modeling_whisper.py +11 -3
  757. transformers/models/whisper/tokenization_whisper.py +4 -15
  758. transformers/models/x_clip/modeling_x_clip.py +5 -0
  759. transformers/models/xcodec/modeling_xcodec.py +5 -0
  760. transformers/models/xglm/modeling_xglm.py +11 -0
  761. transformers/models/xglm/tokenization_xglm.py +4 -9
  762. transformers/models/xlm/modeling_xlm.py +18 -14
  763. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  764. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  765. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  766. transformers/models/xlnet/modeling_xlnet.py +3 -1
  767. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  768. transformers/models/xmod/modeling_xmod.py +3 -0
  769. transformers/models/yoso/modeling_yoso.py +10 -1
  770. transformers/models/zamba/modeling_zamba.py +4 -1
  771. transformers/models/zamba2/modeling_zamba2.py +7 -4
  772. transformers/models/zamba2/modular_zamba2.py +1 -1
  773. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  774. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  775. transformers/models/zoedepth/modeling_zoedepth.py +8 -0
  776. transformers/pipelines/__init__.py +11 -9
  777. transformers/pipelines/automatic_speech_recognition.py +20 -12
  778. transformers/pipelines/base.py +2 -10
  779. transformers/pipelines/document_question_answering.py +4 -2
  780. transformers/pipelines/question_answering.py +1 -1
  781. transformers/pipelines/text_generation.py +1 -1
  782. transformers/pipelines/text_to_audio.py +2 -2
  783. transformers/processing_utils.py +133 -50
  784. transformers/quantizers/auto.py +2 -4
  785. transformers/quantizers/base.py +44 -174
  786. transformers/quantizers/quantizer_aqlm.py +2 -23
  787. transformers/quantizers/quantizer_auto_round.py +2 -12
  788. transformers/quantizers/quantizer_awq.py +20 -89
  789. transformers/quantizers/quantizer_bitnet.py +4 -14
  790. transformers/quantizers/quantizer_bnb_4bit.py +18 -155
  791. transformers/quantizers/quantizer_bnb_8bit.py +24 -110
  792. transformers/quantizers/quantizer_compressed_tensors.py +2 -9
  793. transformers/quantizers/quantizer_eetq.py +16 -74
  794. transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
  795. transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
  796. transformers/quantizers/quantizer_fp_quant.py +52 -82
  797. transformers/quantizers/quantizer_gptq.py +8 -28
  798. transformers/quantizers/quantizer_higgs.py +42 -60
  799. transformers/quantizers/quantizer_hqq.py +144 -153
  800. transformers/quantizers/quantizer_mxfp4.py +14 -194
  801. transformers/quantizers/quantizer_quanto.py +35 -79
  802. transformers/quantizers/quantizer_quark.py +36 -17
  803. transformers/quantizers/quantizer_spqr.py +4 -12
  804. transformers/quantizers/quantizer_torchao.py +50 -325
  805. transformers/quantizers/quantizer_vptq.py +4 -27
  806. transformers/quantizers/quantizers_utils.py +20 -0
  807. transformers/testing_utils.py +324 -47
  808. transformers/tokenization_mistral_common.py +7 -2
  809. transformers/tokenization_utils_base.py +116 -224
  810. transformers/tokenization_utils_tokenizers.py +190 -106
  811. transformers/trainer.py +51 -32
  812. transformers/trainer_callback.py +8 -0
  813. transformers/trainer_jit_checkpoint.py +126 -0
  814. transformers/trainer_seq2seq.py +4 -0
  815. transformers/trainer_utils.py +1 -1
  816. transformers/training_args.py +74 -38
  817. transformers/utils/__init__.py +7 -4
  818. transformers/utils/attention_visualizer.py +4 -4
  819. transformers/utils/auto_docstring.py +35 -25
  820. transformers/utils/generic.py +47 -1
  821. transformers/utils/hub.py +5 -15
  822. transformers/utils/import_utils.py +112 -25
  823. transformers/utils/kernel_config.py +74 -19
  824. transformers/utils/loading_report.py +19 -10
  825. transformers/utils/quantization_config.py +78 -245
  826. transformers/video_processing_utils.py +17 -14
  827. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
  828. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
  829. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
  830. transformers/kernels/__init__.py +0 -0
  831. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  832. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  833. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  834. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
  835. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -215,6 +215,46 @@ class Siglip2VisionEmbeddings(nn.Module):
215
215
  return embeddings
216
216
 
217
217
 
218
+ class Siglip2TextEmbeddings(nn.Module):
219
+ def __init__(self, config: Siglip2TextConfig):
220
+ super().__init__()
221
+ embed_dim = config.hidden_size
222
+
223
+ self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
224
+ self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
225
+
226
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
227
+ self.register_buffer(
228
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
229
+ )
230
+
231
+ def forward(
232
+ self,
233
+ input_ids: Optional[torch.LongTensor] = None,
234
+ position_ids: Optional[torch.LongTensor] = None,
235
+ inputs_embeds: Optional[torch.FloatTensor] = None,
236
+ ) -> torch.Tensor:
237
+ seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
238
+ max_position_embedding = self.position_embedding.weight.shape[0]
239
+
240
+ if seq_length > max_position_embedding:
241
+ raise ValueError(
242
+ f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
243
+ f"{seq_length} and max_position_embeddings: {max_position_embedding}"
244
+ )
245
+
246
+ if position_ids is None:
247
+ position_ids = self.position_ids[:, :seq_length]
248
+
249
+ if inputs_embeds is None:
250
+ inputs_embeds = self.token_embedding(input_ids)
251
+
252
+ position_embeddings = self.position_embedding(position_ids)
253
+ embeddings = inputs_embeds + position_embeddings
254
+
255
+ return embeddings
256
+
257
+
218
258
  def eager_attention_forward(
219
259
  module: nn.Module,
220
260
  query: torch.Tensor,
@@ -412,6 +452,8 @@ class Siglip2PreTrainedModel(PreTrainedModel):
412
452
  else self.config.hidden_size
413
453
  )
414
454
  init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
455
+ if hasattr(module, "position_ids"):
456
+ init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
415
457
  elif isinstance(module, nn.Embedding):
416
458
  default_flax_embed_init(module.weight)
417
459
  elif isinstance(module, Siglip2Attention):
@@ -447,6 +489,8 @@ class Siglip2PreTrainedModel(PreTrainedModel):
447
489
  elif isinstance(module, nn.LayerNorm):
448
490
  init.zeros_(module.bias)
449
491
  init.ones_(module.weight)
492
+ elif isinstance(module, Siglip2TextEmbeddings):
493
+ init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
450
494
 
451
495
 
452
496
  class Siglip2Encoder(nn.Module):
@@ -484,6 +528,7 @@ class Siglip2Encoder(nn.Module):
484
528
 
485
529
 
486
530
  class Siglip2VisionTransformer(Siglip2PreTrainedModel):
531
+ _input_embed_layer = "patch_embedding"
487
532
  _can_record_outputs = {
488
533
  "hidden_states": Siglip2EncoderLayer,
489
534
  "attentions": Siglip2Attention,
@@ -501,6 +546,8 @@ class Siglip2VisionTransformer(Siglip2PreTrainedModel):
501
546
  if self.use_head:
502
547
  self.head = Siglip2MultiheadAttentionPoolingHead(config)
503
548
 
549
+ self.post_init()
550
+
504
551
  @check_model_inputs(tie_last_hidden_states=False)
505
552
  @auto_docstring
506
553
  def forward(
@@ -510,6 +557,7 @@ class Siglip2VisionTransformer(Siglip2PreTrainedModel):
510
557
  spatial_shapes: torch.LongTensor,
511
558
  output_attentions: Optional[bool] = None,
512
559
  output_hidden_states: Optional[bool] = None,
560
+ **kwargs,
513
561
  ) -> BaseModelOutputWithPooling:
514
562
  r"""
515
563
  spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
@@ -548,49 +596,11 @@ class Siglip2VisionTransformer(Siglip2PreTrainedModel):
548
596
  )
549
597
 
550
598
 
551
- class Siglip2TextEmbeddings(nn.Module):
552
- def __init__(self, config: Siglip2TextConfig):
553
- super().__init__()
554
- embed_dim = config.hidden_size
599
+ class Siglip2TextTransformer(Siglip2PreTrainedModel):
600
+ _input_embed_layer = "token_embedding"
555
601
 
556
- self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
557
- self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
558
-
559
- # position_ids (1, len position emb) is contiguous in memory and exported when serialized
560
- self.register_buffer(
561
- "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
562
- )
563
-
564
- def forward(
565
- self,
566
- input_ids: Optional[torch.LongTensor] = None,
567
- position_ids: Optional[torch.LongTensor] = None,
568
- inputs_embeds: Optional[torch.FloatTensor] = None,
569
- ) -> torch.Tensor:
570
- seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
571
- max_position_embedding = self.position_embedding.weight.shape[0]
572
-
573
- if seq_length > max_position_embedding:
574
- raise ValueError(
575
- f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
576
- f"{seq_length} and max_position_embeddings: {max_position_embedding}"
577
- )
578
-
579
- if position_ids is None:
580
- position_ids = self.position_ids[:, :seq_length]
581
-
582
- if inputs_embeds is None:
583
- inputs_embeds = self.token_embedding(input_ids)
584
-
585
- position_embeddings = self.position_embedding(position_ids)
586
- embeddings = inputs_embeds + position_embeddings
587
-
588
- return embeddings
589
-
590
-
591
- class Siglip2TextTransformer(nn.Module):
592
602
  def __init__(self, config: Siglip2TextConfig):
593
- super().__init__()
603
+ super().__init__(config)
594
604
  self.config = config
595
605
  embed_dim = config.hidden_size
596
606
  self.embeddings = Siglip2TextEmbeddings(config)
@@ -598,6 +608,7 @@ class Siglip2TextTransformer(nn.Module):
598
608
  self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
599
609
 
600
610
  self.head = nn.Linear(embed_dim, config.projection_size)
611
+ self.post_init()
601
612
 
602
613
  @can_return_tuple
603
614
  @auto_docstring
@@ -760,6 +771,7 @@ class Siglip2VisionModel(Siglip2PreTrainedModel):
760
771
  spatial_shapes: torch.LongTensor,
761
772
  output_attentions: Optional[bool] = None,
762
773
  output_hidden_states: Optional[bool] = None,
774
+ **kwargs,
763
775
  ) -> BaseModelOutputWithPooling:
764
776
  r"""
765
777
  pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
@@ -831,6 +843,12 @@ class Siglip2Model(Siglip2PreTrainedModel):
831
843
  # Initialize weights and apply final processing
832
844
  self.post_init()
833
845
 
846
+ def get_input_embeddings(self) -> nn.Module:
847
+ return self.text_model.embeddings.token_embedding
848
+
849
+ def set_input_embeddings(self, value: nn.Module):
850
+ self.text_model.embeddings.token_embedding = value
851
+
834
852
  @filter_out_non_signature_kwargs()
835
853
  @auto_docstring
836
854
  def get_text_features(
@@ -927,6 +945,7 @@ class Siglip2Model(Siglip2PreTrainedModel):
927
945
  return_loss: Optional[bool] = None,
928
946
  output_attentions: Optional[bool] = None,
929
947
  output_hidden_states: Optional[bool] = None,
948
+ **kwargs,
930
949
  ) -> Siglip2Output:
931
950
  r"""
932
951
  pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
@@ -1048,6 +1067,12 @@ class Siglip2ForImageClassification(Siglip2PreTrainedModel):
1048
1067
  # Initialize weights and apply final processing
1049
1068
  self.post_init()
1050
1069
 
1070
+ def get_input_embeddings(self) -> nn.Module:
1071
+ return self.vision_model.embeddings.patch_embedding
1072
+
1073
+ def set_input_embeddings(self, value: nn.Module):
1074
+ self.vision_model.embeddings.patch_embedding = value
1075
+
1051
1076
  @check_model_inputs
1052
1077
  @auto_docstring
1053
1078
  def forward(
@@ -1058,6 +1083,7 @@ class Siglip2ForImageClassification(Siglip2PreTrainedModel):
1058
1083
  labels: Optional[torch.Tensor] = None,
1059
1084
  output_attentions: Optional[bool] = None,
1060
1085
  output_hidden_states: Optional[bool] = None,
1086
+ **kwargs,
1061
1087
  ) -> ImageClassifierOutput:
1062
1088
  r"""
1063
1089
  pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
@@ -247,6 +247,7 @@ class Siglip2VisionTransformer(SiglipVisionTransformer):
247
247
  spatial_shapes: torch.LongTensor,
248
248
  output_attentions: Optional[bool] = None,
249
249
  output_hidden_states: Optional[bool] = None,
250
+ **kwargs,
250
251
  ) -> BaseModelOutputWithPooling:
251
252
  r"""
252
253
  spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
@@ -324,6 +325,7 @@ class Siglip2VisionModel(SiglipVisionModel):
324
325
  spatial_shapes: torch.LongTensor,
325
326
  output_attentions: Optional[bool] = None,
326
327
  output_hidden_states: Optional[bool] = None,
328
+ **kwargs,
327
329
  ) -> BaseModelOutputWithPooling:
328
330
  r"""
329
331
  pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
@@ -419,6 +421,7 @@ class Siglip2Model(SiglipModel):
419
421
  return_loss: Optional[bool] = None,
420
422
  output_attentions: Optional[bool] = None,
421
423
  output_hidden_states: Optional[bool] = None,
424
+ **kwargs,
422
425
  ) -> Siglip2Output:
423
426
  r"""
424
427
  pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
@@ -522,6 +525,7 @@ class Siglip2ForImageClassification(SiglipForImageClassification):
522
525
  labels: Optional[torch.Tensor] = None,
523
526
  output_attentions: Optional[bool] = None,
524
527
  output_hidden_states: Optional[bool] = None,
528
+ **kwargs,
525
529
  ) -> ImageClassifierOutput:
526
530
  r"""
527
531
  pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
@@ -28,7 +28,7 @@ from torch import nn
28
28
  from ...activations import ACT2FN
29
29
  from ...cache_utils import Cache, DynamicCache
30
30
  from ...generation import GenerationMixin
31
- from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
31
+ from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
32
32
  from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
33
33
  from ...modeling_flash_attention_utils import FlashAttentionKwargs
34
34
  from ...modeling_layers import (
@@ -42,7 +42,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
42
42
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
43
43
  from ...processing_utils import Unpack
44
44
  from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
45
- from ...utils.generic import check_model_inputs
45
+ from ...utils.generic import check_model_inputs, maybe_autocast
46
46
  from .configuration_smollm3 import SmolLM3Config
47
47
 
48
48
 
@@ -63,7 +63,7 @@ class SmolLM3RotaryEmbedding(nn.Module):
63
63
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
64
64
 
65
65
  self.register_buffer("inv_freq", inv_freq, persistent=False)
66
- self.original_inv_freq = inv_freq
66
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
67
67
 
68
68
  @staticmethod
69
69
  def compute_default_rope_parameters(
@@ -102,7 +102,7 @@ class SmolLM3RotaryEmbedding(nn.Module):
102
102
  position_ids_expanded = position_ids[:, None, :].float()
103
103
 
104
104
  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
105
- with torch.autocast(device_type=device_type, enabled=False): # Force float32
105
+ with maybe_autocast(device_type=device_type, enabled=False): # Force float32
106
106
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
107
107
  emb = torch.cat((freqs, freqs), dim=-1)
108
108
  cos = emb.cos() * self.attention_scaling
@@ -184,6 +184,7 @@ def eager_attention_forward(
184
184
  return attn_output, attn_weights
185
185
 
186
186
 
187
+ @use_kernelized_func(apply_rotary_pos_emb)
187
188
  class SmolLM3Attention(nn.Module):
188
189
  """Multi-headed attention from 'Attention Is All You Need' paper"""
189
190
 
@@ -209,7 +210,6 @@ class SmolLM3Attention(nn.Module):
209
210
  self.o_proj = nn.Linear(
210
211
  config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
211
212
  )
212
- self.rotary_fn = apply_rotary_pos_emb
213
213
 
214
214
  self.use_rope = config.no_rope_layers[layer_idx]
215
215
  self.sliding_window = (
@@ -330,6 +330,8 @@ class SmolVLMVisionTransformer(SmolVLMPreTrainedModel):
330
330
  self.patch_size = config.patch_size
331
331
  self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
332
332
 
333
+ self.post_init()
334
+
333
335
  def get_input_embeddings(self):
334
336
  return self.embeddings
335
337
 
@@ -853,6 +855,7 @@ class SmolVLMForConditionalGeneration(SmolVLMPreTrainedModel, GenerationMixin):
853
855
  pixel_attention_mask=None,
854
856
  image_hidden_states=None,
855
857
  logits_to_keep=None,
858
+ is_first_iteration=False,
856
859
  **kwargs,
857
860
  ):
858
861
  # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take
@@ -868,10 +871,11 @@ class SmolVLMForConditionalGeneration(SmolVLMPreTrainedModel, GenerationMixin):
868
871
  pixel_attention_mask=pixel_attention_mask,
869
872
  image_hidden_states=image_hidden_states,
870
873
  logits_to_keep=logits_to_keep,
874
+ is_first_iteration=is_first_iteration,
871
875
  **kwargs,
872
876
  )
873
877
 
874
- if image_hidden_states is not None or cache_position[0] != 0:
878
+ if image_hidden_states is not None or not is_first_iteration:
875
879
  model_inputs["pixel_values"] = None
876
880
  model_inputs["pixel_attention_mask"] = None
877
881
 
@@ -27,13 +27,6 @@ from ...utils import is_num2words_available, is_vision_available, logging
27
27
  from ...video_utils import VideoInput
28
28
 
29
29
 
30
- if is_vision_available():
31
- from .video_processing_smolvlm import (
32
- DEFAULT_MEDIA_OUTTRO,
33
- DEFAULT_VIDEO_INTRO,
34
- FRAME_TIMESTAMP_MESSAGE,
35
- )
36
-
37
30
  if is_vision_available():
38
31
  from .video_processing_smolvlm import (
39
32
  DEFAULT_MEDIA_OUTTRO,
@@ -331,7 +331,6 @@ class SmolVLMVideoProcessor(BaseVideoProcessor):
331
331
  processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
332
332
  pixel_attention_mask = reorder_videos(processed_padded_mask_grouped, grouped_videos_index)
333
333
 
334
- processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos
335
334
  data = {"pixel_values": processed_videos}
336
335
 
337
336
  if do_pad:
@@ -22,6 +22,7 @@ import torch
22
22
  from torch import nn
23
23
  from torch.nn import CrossEntropyLoss
24
24
 
25
+ from ... import initialization as init
25
26
  from ...activations import ACT2FN
26
27
  from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
27
28
  from ...generation import GenerationMixin
@@ -105,6 +106,7 @@ class Speech2TextSinusoidalPositionalEmbedding(nn.Module):
105
106
  def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
106
107
  super().__init__()
107
108
  self.offset = 2
109
+ self.num_positions = num_positions
108
110
  self.embedding_dim = embedding_dim
109
111
  self.padding_idx = padding_idx
110
112
  self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
@@ -495,6 +497,14 @@ class Speech2TextPreTrainedModel(PreTrainedModel):
495
497
  _supports_sdpa = False
496
498
  _supports_flex_attn = False
497
499
 
500
+ def _init_weights(self, module):
501
+ super()._init_weights(module)
502
+ if isinstance(module, Speech2TextSinusoidalPositionalEmbedding):
503
+ emb_weights = module.get_embedding(
504
+ module.num_positions + module.offset, module.embedding_dim, module.padding_idx
505
+ )
506
+ init.copy_(module.weights, emb_weights)
507
+
498
508
  def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
499
509
  """
500
510
  Computes the output length of the convolutional layers
@@ -567,6 +577,7 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
567
577
  output_attentions=None,
568
578
  output_hidden_states=None,
569
579
  return_dict=None,
580
+ **kwargs,
570
581
  ):
571
582
  r"""
572
583
  Args:
@@ -707,6 +718,7 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel):
707
718
  output_hidden_states=None,
708
719
  return_dict=None,
709
720
  cache_position=None,
721
+ **kwargs,
710
722
  ):
711
723
  r"""
712
724
  Args:
@@ -899,6 +911,7 @@ class Speech2TextModel(Speech2TextPreTrainedModel):
899
911
  output_hidden_states: Optional[bool] = None,
900
912
  return_dict: Optional[bool] = None,
901
913
  cache_position: Optional[torch.Tensor] = None,
914
+ **kwargs,
902
915
  ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
903
916
  r"""
904
917
  decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1035,6 +1048,7 @@ class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel, Generation
1035
1048
  output_hidden_states: Optional[bool] = None,
1036
1049
  return_dict: Optional[bool] = None,
1037
1050
  cache_position: Optional[torch.Tensor] = None,
1051
+ **kwargs,
1038
1052
  ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
1039
1053
  r"""
1040
1054
  decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -290,6 +290,7 @@ class SpeechT5SinusoidalPositionalEmbedding(nn.Module):
290
290
  def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
291
291
  super().__init__()
292
292
  self.offset = 2
293
+ self.num_positions = num_positions
293
294
  self.embedding_dim = embedding_dim
294
295
  self.padding_idx = padding_idx
295
296
  self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
@@ -414,6 +415,7 @@ class SpeechT5ScaledPositionalEncoding(nn.Module):
414
415
  self.register_buffer("pe", pe, persistent=False)
415
416
  self.dropout = nn.Dropout(p=dropout)
416
417
  self.dim = dim
418
+ self.max_len = max_len
417
419
  self.alpha = nn.Parameter(torch.tensor(1.0))
418
420
 
419
421
  def forward(self, emb):
@@ -1184,6 +1186,14 @@ class SpeechT5PreTrainedModel(PreTrainedModel):
1184
1186
  init.constant_(module.conv.bias, 0)
1185
1187
  elif isinstance(module, SpeechT5ScaledPositionalEncoding):
1186
1188
  init.ones_(module.alpha)
1189
+ dim, max_len = module.dim, module.max_len
1190
+ pe = torch.zeros(max_len, dim)
1191
+ position = torch.arange(0, max_len).unsqueeze(1)
1192
+ div_term = torch.exp(torch.arange(0, dim, 2, dtype=torch.int64).float() * -(math.log(10000.0) / dim))
1193
+ pe[:, 0::2] = torch.sin(position.float() * div_term)
1194
+ pe[:, 1::2] = torch.cos(position.float() * div_term)
1195
+ pe = pe.unsqueeze(0)
1196
+ init.copy_(module.pe, pe)
1187
1197
  elif isinstance(module, SpeechT5FeatureProjection):
1188
1198
  k = math.sqrt(1 / module.projection.in_features)
1189
1199
  init.uniform_(module.projection.weight, a=-k, b=k)
@@ -1195,6 +1205,10 @@ class SpeechT5PreTrainedModel(PreTrainedModel):
1195
1205
  elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm1d)):
1196
1206
  init.zeros_(module.bias)
1197
1207
  init.ones_(module.weight)
1208
+ if getattr(module, "running_mean", None) is not None:
1209
+ init.zeros_(module.running_mean)
1210
+ init.ones_(module.running_var)
1211
+ init.zeros_(module.num_batches_tracked)
1198
1212
  elif isinstance(module, nn.Conv1d):
1199
1213
  init.kaiming_normal_(module.weight)
1200
1214
  if module.bias is not None:
@@ -1205,6 +1219,14 @@ class SpeechT5PreTrainedModel(PreTrainedModel):
1205
1219
  # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
1206
1220
  if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
1207
1221
  init.zeros_(module.weight[module.padding_idx])
1222
+ elif isinstance(module, SpeechT5SinusoidalPositionalEmbedding):
1223
+ emb_weights = module.get_embedding(
1224
+ module.num_positions + module.offset, module.embedding_dim, module.padding_idx
1225
+ )
1226
+ init.copy_(module.weights, emb_weights)
1227
+ elif isinstance(module, SpeechT5HifiGan):
1228
+ init.zeros_(module.mean)
1229
+ init.ones_(module.scale)
1208
1230
 
1209
1231
  if hasattr(module, "masked_spec_embed"):
1210
1232
  init.uniform_(module.masked_spec_embed)
@@ -1239,6 +1261,7 @@ class SpeechT5Encoder(SpeechT5PreTrainedModel):
1239
1261
  output_attentions: Optional[bool] = None,
1240
1262
  output_hidden_states: Optional[bool] = None,
1241
1263
  return_dict: Optional[bool] = None,
1264
+ **kwargs,
1242
1265
  ) -> Union[tuple, BaseModelOutput]:
1243
1266
  """
1244
1267
  Args:
@@ -1342,6 +1365,7 @@ class SpeechT5EncoderWithSpeechPrenet(SpeechT5PreTrainedModel):
1342
1365
  output_attentions: Optional[bool] = None,
1343
1366
  output_hidden_states: Optional[bool] = None,
1344
1367
  return_dict: Optional[bool] = None,
1368
+ **kwargs,
1345
1369
  ) -> Union[tuple, BaseModelOutput]:
1346
1370
  hidden_states, attention_mask = self.prenet(input_values, attention_mask)
1347
1371
 
@@ -1382,6 +1406,7 @@ class SpeechT5EncoderWithTextPrenet(SpeechT5PreTrainedModel):
1382
1406
  output_attentions: Optional[bool] = None,
1383
1407
  output_hidden_states: Optional[bool] = None,
1384
1408
  return_dict: Optional[bool] = None,
1409
+ **kwargs,
1385
1410
  ) -> Union[tuple, BaseModelOutput]:
1386
1411
  hidden_states = self.prenet(input_values)
1387
1412
 
@@ -1416,6 +1441,7 @@ class SpeechT5EncoderWithoutPrenet(SpeechT5PreTrainedModel):
1416
1441
  output_attentions: Optional[bool] = None,
1417
1442
  output_hidden_states: Optional[bool] = None,
1418
1443
  return_dict: Optional[bool] = None,
1444
+ **kwargs,
1419
1445
  ) -> Union[tuple, BaseModelOutput]:
1420
1446
  return self.wrapped_encoder(
1421
1447
  hidden_states=input_values,
@@ -1454,6 +1480,7 @@ class SpeechT5Decoder(SpeechT5PreTrainedModel):
1454
1480
  output_hidden_states: Optional[bool] = None,
1455
1481
  return_dict: Optional[bool] = None,
1456
1482
  cache_position: Optional[torch.Tensor] = None,
1483
+ **kwargs,
1457
1484
  ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
1458
1485
  r"""
1459
1486
  Args:
@@ -1613,6 +1640,7 @@ class SpeechT5DecoderWithSpeechPrenet(SpeechT5PreTrainedModel):
1613
1640
  output_hidden_states: Optional[bool] = None,
1614
1641
  return_dict: Optional[bool] = None,
1615
1642
  cache_position: Optional[torch.Tensor] = None,
1643
+ **kwargs,
1616
1644
  ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
1617
1645
  decoder_hidden_states = self.prenet(input_values, speaker_embeddings)
1618
1646
 
@@ -1663,6 +1691,7 @@ class SpeechT5DecoderWithTextPrenet(SpeechT5PreTrainedModel):
1663
1691
  output_hidden_states: Optional[bool] = None,
1664
1692
  return_dict: Optional[bool] = None,
1665
1693
  cache_position: Optional[torch.Tensor] = None,
1694
+ **kwargs,
1666
1695
  ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
1667
1696
  decoder_hidden_states, attention_mask = self.prenet(input_values, attention_mask, past_key_values)
1668
1697
 
@@ -1707,6 +1736,7 @@ class SpeechT5DecoderWithoutPrenet(SpeechT5PreTrainedModel):
1707
1736
  output_hidden_states: Optional[bool] = None,
1708
1737
  return_dict: Optional[bool] = None,
1709
1738
  cache_position: Optional[torch.Tensor] = None,
1739
+ **kwargs,
1710
1740
  ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
1711
1741
  outputs = self.wrapped_decoder(
1712
1742
  hidden_states=input_values,
@@ -1905,6 +1935,7 @@ class SpeechT5Model(SpeechT5PreTrainedModel):
1905
1935
  output_hidden_states: Optional[bool] = None,
1906
1936
  return_dict: Optional[bool] = None,
1907
1937
  cache_position: Optional[torch.Tensor] = None,
1938
+ **kwargs,
1908
1939
  ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
1909
1940
  r"""
1910
1941
  input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
@@ -2046,6 +2077,7 @@ class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel, GenerationMixin):
2046
2077
  return_dict: Optional[bool] = None,
2047
2078
  labels: Optional[torch.LongTensor] = None,
2048
2079
  cache_position: Optional[torch.Tensor] = None,
2080
+ **kwargs,
2049
2081
  ) -> Union[tuple, Seq2SeqLMOutput]:
2050
2082
  r"""
2051
2083
  input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -2356,6 +2388,7 @@ class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel):
2356
2388
  labels: Optional[torch.FloatTensor] = None,
2357
2389
  stop_labels: Optional[torch.Tensor] = None,
2358
2390
  cache_position: Optional[torch.Tensor] = None,
2391
+ **kwargs,
2359
2392
  ) -> Union[tuple, Seq2SeqSpectrogramOutput]:
2360
2393
  r"""
2361
2394
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -2694,6 +2727,7 @@ class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
2694
2727
  labels: Optional[torch.FloatTensor] = None,
2695
2728
  stop_labels: Optional[torch.Tensor] = None,
2696
2729
  cache_position: Optional[torch.Tensor] = None,
2730
+ **kwargs,
2697
2731
  ) -> Union[tuple, Seq2SeqSpectrogramOutput]:
2698
2732
  r"""
2699
2733
  input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -2996,6 +3030,12 @@ class SpeechT5HifiGan(PreTrainedModel):
2996
3030
  # Initialize weights and apply final processing
2997
3031
  self.post_init()
2998
3032
 
3033
+ def _init_weights(self, module):
3034
+ super()._init_weights(module)
3035
+ if isinstance(module, SpeechT5HifiGan):
3036
+ init.zeros_(module.mean)
3037
+ init.ones_(module.scale)
3038
+
2999
3039
  def apply_weight_norm(self):
3000
3040
  weight_norm = nn.utils.weight_norm
3001
3041
  if hasattr(nn.utils.parametrizations, "weight_norm"):
@@ -3023,7 +3063,7 @@ class SpeechT5HifiGan(PreTrainedModel):
3023
3063
  waveform.
3024
3064
  """
3025
3065
  )
3026
- def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
3066
+ def forward(self, spectrogram: torch.FloatTensor, **kwargs) -> torch.FloatTensor:
3027
3067
  r"""
3028
3068
  spectrogram (`torch.FloatTensor`):
3029
3069
  Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
@@ -22,6 +22,7 @@ import torch
22
22
  from torch import nn
23
23
  from torch.nn import CrossEntropyLoss
24
24
 
25
+ from ... import initialization as init
25
26
  from ...activations import ACT2FN
26
27
  from ...modeling_layers import GradientCheckpointingLayer
27
28
  from ...modeling_outputs import BaseModelOutput, ModelOutput, QuestionAnsweringModelOutput
@@ -305,9 +306,9 @@ class SplinterEncoder(nn.Module):
305
306
  all_hidden_states = all_hidden_states + (hidden_states,)
306
307
 
307
308
  layer_outputs = layer_module(
308
- hidden_states=hidden_states,
309
- attention_mask=attention_mask,
310
- output_attentions=output_attentions,
309
+ hidden_states,
310
+ attention_mask,
311
+ output_attentions,
311
312
  **kwargs,
312
313
  )
313
314
 
@@ -331,6 +332,11 @@ class SplinterPreTrainedModel(PreTrainedModel):
331
332
  base_model_prefix = "splinter"
332
333
  supports_gradient_checkpointing = True
333
334
 
335
+ def _init_weights(self, module):
336
+ super()._init_weights(module)
337
+ if isinstance(module, SplinterEmbeddings):
338
+ init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
339
+
334
340
 
335
341
  @auto_docstring
336
342
  class SplinterModel(SplinterPreTrainedModel):
@@ -368,6 +374,7 @@ class SplinterModel(SplinterPreTrainedModel):
368
374
  output_attentions: Optional[bool] = None,
369
375
  output_hidden_states: Optional[bool] = None,
370
376
  return_dict: Optional[bool] = None,
377
+ **kwargs,
371
378
  ) -> Union[tuple, BaseModelOutput]:
372
379
  r"""
373
380
  token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
@@ -516,6 +523,7 @@ class SplinterForQuestionAnswering(SplinterPreTrainedModel):
516
523
  output_hidden_states: Optional[bool] = None,
517
524
  return_dict: Optional[bool] = None,
518
525
  question_positions: Optional[torch.LongTensor] = None,
526
+ **kwargs,
519
527
  ) -> Union[tuple, QuestionAnsweringModelOutput]:
520
528
  r"""
521
529
  token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
@@ -658,6 +666,7 @@ class SplinterForPreTraining(SplinterPreTrainedModel):
658
666
  output_hidden_states: Optional[bool] = None,
659
667
  return_dict: Optional[bool] = None,
660
668
  question_positions: Optional[torch.LongTensor] = None,
669
+ **kwargs,
661
670
  ) -> Union[tuple, SplinterForPreTrainingOutput]:
662
671
  r"""
663
672
  input_ids (`torch.LongTensor` of shape `(batch_size, num_questions, sequence_length)`):