transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (835)
  1. transformers/__init__.py +49 -3
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/cli/serve.py +47 -17
  6. transformers/configuration_utils.py +114 -70
  7. transformers/conversion_mapping.py +83 -7
  8. transformers/convert_slow_tokenizer.py +225 -10
  9. transformers/core_model_loading.py +374 -147
  10. transformers/data/data_collator.py +12 -4
  11. transformers/dependency_versions_table.py +2 -3
  12. transformers/dynamic_module_utils.py +1 -2
  13. transformers/feature_extraction_utils.py +55 -24
  14. transformers/file_utils.py +0 -1
  15. transformers/generation/__init__.py +11 -1
  16. transformers/generation/candidate_generator.py +79 -31
  17. transformers/generation/configuration_utils.py +165 -124
  18. transformers/generation/continuous_batching/__init__.py +4 -0
  19. transformers/generation/continuous_batching/cache.py +47 -18
  20. transformers/generation/continuous_batching/cache_manager.py +131 -34
  21. transformers/generation/continuous_batching/continuous_api.py +228 -136
  22. transformers/generation/continuous_batching/requests.py +28 -1
  23. transformers/generation/continuous_batching/scheduler.py +11 -4
  24. transformers/generation/stopping_criteria.py +1 -1
  25. transformers/generation/utils.py +108 -110
  26. transformers/generation/watermarking.py +8 -5
  27. transformers/image_processing_base.py +3 -14
  28. transformers/image_processing_utils_fast.py +15 -4
  29. transformers/initialization.py +37 -0
  30. transformers/integrations/__init__.py +16 -2
  31. transformers/integrations/accelerate.py +58 -113
  32. transformers/integrations/aqlm.py +36 -66
  33. transformers/integrations/awq.py +46 -515
  34. transformers/integrations/bitnet.py +47 -105
  35. transformers/integrations/bitsandbytes.py +91 -202
  36. transformers/integrations/deepspeed.py +18 -2
  37. transformers/integrations/eetq.py +84 -81
  38. transformers/integrations/fbgemm_fp8.py +191 -145
  39. transformers/integrations/finegrained_fp8.py +241 -208
  40. transformers/integrations/flash_attention.py +2 -2
  41. transformers/integrations/fp_quant.py +92 -0
  42. transformers/integrations/ggml.py +11 -1
  43. transformers/integrations/higgs.py +37 -62
  44. transformers/integrations/hub_kernels.py +65 -8
  45. transformers/integrations/integration_utils.py +45 -0
  46. transformers/integrations/mistral.py +12 -0
  47. transformers/integrations/moe.py +240 -0
  48. transformers/integrations/mxfp4.py +28 -74
  49. transformers/integrations/peft.py +12 -29
  50. transformers/integrations/quanto.py +77 -56
  51. transformers/integrations/quark.py +55 -0
  52. transformers/integrations/spqr.py +42 -90
  53. transformers/integrations/tensor_parallel.py +167 -221
  54. transformers/integrations/torchao.py +32 -38
  55. transformers/integrations/vptq.py +40 -59
  56. transformers/modelcard.py +1 -2
  57. transformers/modeling_gguf_pytorch_utils.py +74 -19
  58. transformers/modeling_rope_utils.py +107 -86
  59. transformers/modeling_utils.py +611 -527
  60. transformers/models/__init__.py +22 -0
  61. transformers/models/afmoe/modeling_afmoe.py +10 -19
  62. transformers/models/afmoe/modular_afmoe.py +5 -13
  63. transformers/models/aimv2/modeling_aimv2.py +4 -0
  64. transformers/models/aimv2/modular_aimv2.py +4 -0
  65. transformers/models/albert/modeling_albert.py +3 -0
  66. transformers/models/albert/tokenization_albert.py +6 -12
  67. transformers/models/align/modeling_align.py +14 -6
  68. transformers/models/altclip/modeling_altclip.py +11 -3
  69. transformers/models/apertus/modeling_apertus.py +8 -6
  70. transformers/models/apertus/modular_apertus.py +4 -1
  71. transformers/models/arcee/modeling_arcee.py +5 -5
  72. transformers/models/aria/modeling_aria.py +12 -8
  73. transformers/models/aria/modular_aria.py +7 -3
  74. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  75. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  76. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  77. transformers/models/auto/auto_factory.py +1 -1
  78. transformers/models/auto/configuration_auto.py +38 -0
  79. transformers/models/auto/feature_extraction_auto.py +9 -3
  80. transformers/models/auto/image_processing_auto.py +5 -2
  81. transformers/models/auto/modeling_auto.py +37 -0
  82. transformers/models/auto/processing_auto.py +22 -10
  83. transformers/models/auto/tokenization_auto.py +147 -566
  84. transformers/models/auto/video_processing_auto.py +5 -2
  85. transformers/models/autoformer/modeling_autoformer.py +4 -0
  86. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  87. transformers/models/bamba/modeling_bamba.py +21 -21
  88. transformers/models/bamba/modular_bamba.py +17 -16
  89. transformers/models/bark/modeling_bark.py +11 -0
  90. transformers/models/bart/configuration_bart.py +0 -1
  91. transformers/models/bart/modeling_bart.py +14 -0
  92. transformers/models/barthez/tokenization_barthez.py +5 -10
  93. transformers/models/beit/image_processing_beit_fast.py +0 -1
  94. transformers/models/beit/modeling_beit.py +6 -1
  95. transformers/models/bert/modeling_bert.py +3 -0
  96. transformers/models/bert/tokenization_bert.py +8 -21
  97. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  98. transformers/models/big_bird/modeling_big_bird.py +9 -0
  99. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  100. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
  101. transformers/models/biogpt/modeling_biogpt.py +2 -0
  102. transformers/models/biogpt/modular_biogpt.py +2 -0
  103. transformers/models/bit/modeling_bit.py +16 -3
  104. transformers/models/bitnet/modeling_bitnet.py +5 -5
  105. transformers/models/blenderbot/modeling_blenderbot.py +12 -0
  106. transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
  107. transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
  108. transformers/models/blip/modeling_blip.py +2 -0
  109. transformers/models/blip/modeling_blip_text.py +10 -0
  110. transformers/models/blip_2/modeling_blip_2.py +4 -1
  111. transformers/models/bloom/modeling_bloom.py +17 -44
  112. transformers/models/blt/modeling_blt.py +164 -4
  113. transformers/models/blt/modular_blt.py +170 -5
  114. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  115. transformers/models/bridgetower/modeling_bridgetower.py +11 -1
  116. transformers/models/bros/modeling_bros.py +12 -0
  117. transformers/models/camembert/modeling_camembert.py +109 -106
  118. transformers/models/camembert/tokenization_camembert.py +8 -12
  119. transformers/models/canine/modeling_canine.py +11 -0
  120. transformers/models/canine/tokenization_canine.py +2 -0
  121. transformers/models/chameleon/modeling_chameleon.py +11 -5
  122. transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
  123. transformers/models/clap/feature_extraction_clap.py +2 -2
  124. transformers/models/clap/modeling_clap.py +30 -15
  125. transformers/models/clip/modeling_clip.py +2 -0
  126. transformers/models/clip/tokenization_clip.py +22 -44
  127. transformers/models/clipseg/modeling_clipseg.py +9 -0
  128. transformers/models/clvp/modeling_clvp.py +19 -3
  129. transformers/models/clvp/tokenization_clvp.py +1 -63
  130. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  131. transformers/models/codegen/modeling_codegen.py +13 -4
  132. transformers/models/codegen/tokenization_codegen.py +14 -43
  133. transformers/models/cohere/modeling_cohere.py +5 -4
  134. transformers/models/cohere/modular_cohere.py +2 -1
  135. transformers/models/cohere/tokenization_cohere.py +12 -42
  136. transformers/models/cohere2/modeling_cohere2.py +8 -7
  137. transformers/models/cohere2/modular_cohere2.py +5 -5
  138. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
  139. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  140. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  141. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  142. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  143. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  144. transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
  145. transformers/models/convbert/modeling_convbert.py +9 -0
  146. transformers/models/convnext/image_processing_convnext.py +2 -2
  147. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  148. transformers/models/convnext/modeling_convnext.py +2 -4
  149. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  150. transformers/models/csm/generation_csm.py +19 -22
  151. transformers/models/csm/modeling_csm.py +7 -4
  152. transformers/models/csm/modular_csm.py +2 -0
  153. transformers/models/ctrl/modeling_ctrl.py +15 -2
  154. transformers/models/cvt/modeling_cvt.py +7 -1
  155. transformers/models/cwm/modeling_cwm.py +5 -5
  156. transformers/models/d_fine/configuration_d_fine.py +3 -4
  157. transformers/models/d_fine/modeling_d_fine.py +48 -39
  158. transformers/models/d_fine/modular_d_fine.py +16 -4
  159. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  160. transformers/models/dab_detr/modeling_dab_detr.py +5 -1
  161. transformers/models/dac/modeling_dac.py +6 -6
  162. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  163. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  164. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  165. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  166. transformers/models/dbrx/configuration_dbrx.py +9 -1
  167. transformers/models/dbrx/modeling_dbrx.py +3 -3
  168. transformers/models/deberta/modeling_deberta.py +7 -0
  169. transformers/models/deberta/tokenization_deberta.py +11 -20
  170. transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
  171. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  172. transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
  173. transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
  174. transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
  175. transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
  176. transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
  177. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  178. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  179. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  180. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  181. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  182. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  183. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  184. transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
  185. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  186. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  187. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  188. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  189. transformers/models/detr/configuration_detr.py +1 -1
  190. transformers/models/detr/modeling_detr.py +13 -1
  191. transformers/models/dia/generation_dia.py +3 -10
  192. transformers/models/dia/modeling_dia.py +16 -4
  193. transformers/models/dia/modular_dia.py +11 -1
  194. transformers/models/dia/processing_dia.py +1 -1
  195. transformers/models/diffllama/modeling_diffllama.py +5 -5
  196. transformers/models/diffllama/modular_diffllama.py +2 -2
  197. transformers/models/dinat/modeling_dinat.py +3 -0
  198. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  199. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  200. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
  201. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
  202. transformers/models/distilbert/modeling_distilbert.py +11 -9
  203. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  204. transformers/models/doge/modeling_doge.py +3 -4
  205. transformers/models/doge/modular_doge.py +0 -1
  206. transformers/models/donut/image_processing_donut_fast.py +0 -1
  207. transformers/models/donut/modeling_donut_swin.py +18 -12
  208. transformers/models/dots1/modeling_dots1.py +23 -11
  209. transformers/models/dots1/modular_dots1.py +5 -3
  210. transformers/models/dpr/modeling_dpr.py +5 -0
  211. transformers/models/dpr/tokenization_dpr.py +12 -0
  212. transformers/models/dpt/configuration_dpt.py +1 -1
  213. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  214. transformers/models/dpt/modular_dpt.py +1 -2
  215. transformers/models/edgetam/configuration_edgetam.py +1 -1
  216. transformers/models/edgetam/modeling_edgetam.py +6 -3
  217. transformers/models/edgetam/modular_edgetam.py +15 -14
  218. transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
  219. transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
  220. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  221. transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
  222. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  223. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  224. transformers/models/efficientnet/modeling_efficientnet.py +7 -1
  225. transformers/models/electra/modeling_electra.py +7 -0
  226. transformers/models/emu3/modeling_emu3.py +12 -6
  227. transformers/models/emu3/modular_emu3.py +7 -1
  228. transformers/models/encodec/modeling_encodec.py +14 -0
  229. transformers/models/eomt/image_processing_eomt.py +13 -1
  230. transformers/models/eomt/image_processing_eomt_fast.py +60 -16
  231. transformers/models/eomt/modeling_eomt.py +7 -0
  232. transformers/models/eomt/modular_eomt.py +7 -0
  233. transformers/models/ernie/modeling_ernie.py +6 -0
  234. transformers/models/ernie/modular_ernie.py +6 -0
  235. transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
  236. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  237. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
  238. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
  239. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  240. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  241. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  242. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  243. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  244. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  245. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  246. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  247. transformers/models/esm/modeling_esm.py +6 -0
  248. transformers/models/esm/modeling_esmfold.py +11 -5
  249. transformers/models/evolla/modeling_evolla.py +13 -5
  250. transformers/models/evolla/modular_evolla.py +8 -0
  251. transformers/models/exaone4/modeling_exaone4.py +3 -3
  252. transformers/models/exaone4/modular_exaone4.py +0 -1
  253. transformers/models/falcon/modeling_falcon.py +9 -4
  254. transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
  255. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  256. transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
  257. transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
  258. transformers/models/fast_vlm/__init__.py +27 -0
  259. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  260. transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
  261. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  262. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
  263. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  264. transformers/models/flaubert/modeling_flaubert.py +21 -15
  265. transformers/models/flava/image_processing_flava_fast.py +0 -2
  266. transformers/models/flava/modeling_flava.py +10 -2
  267. transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
  268. transformers/models/florence2/modeling_florence2.py +22 -4
  269. transformers/models/florence2/modular_florence2.py +15 -1
  270. transformers/models/fnet/modeling_fnet.py +14 -0
  271. transformers/models/focalnet/modeling_focalnet.py +4 -0
  272. transformers/models/fsmt/modeling_fsmt.py +2 -0
  273. transformers/models/funnel/modeling_funnel.py +8 -0
  274. transformers/models/funnel/tokenization_funnel.py +17 -24
  275. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  276. transformers/models/fuyu/modeling_fuyu.py +3 -1
  277. transformers/models/fuyu/processing_fuyu.py +19 -3
  278. transformers/models/gemma/modeling_gemma.py +14 -16
  279. transformers/models/gemma/modular_gemma.py +9 -11
  280. transformers/models/gemma/tokenization_gemma.py +10 -27
  281. transformers/models/gemma2/modeling_gemma2.py +5 -5
  282. transformers/models/gemma2/modular_gemma2.py +3 -2
  283. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  284. transformers/models/gemma3/modeling_gemma3.py +42 -91
  285. transformers/models/gemma3/modular_gemma3.py +38 -87
  286. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  287. transformers/models/gemma3n/modeling_gemma3n.py +65 -218
  288. transformers/models/gemma3n/modular_gemma3n.py +68 -68
  289. transformers/models/git/modeling_git.py +183 -126
  290. transformers/models/glm/modeling_glm.py +5 -5
  291. transformers/models/glm4/modeling_glm4.py +5 -5
  292. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  293. transformers/models/glm46v/modeling_glm46v.py +3 -1
  294. transformers/models/glm46v/modular_glm46v.py +3 -0
  295. transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
  296. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  297. transformers/models/glm4v/configuration_glm4v.py +3 -1
  298. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  299. transformers/models/glm4v/modeling_glm4v.py +18 -8
  300. transformers/models/glm4v/modular_glm4v.py +17 -7
  301. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  302. transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
  303. transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
  304. transformers/models/glmasr/__init__.py +30 -0
  305. transformers/models/glmasr/configuration_glmasr.py +197 -0
  306. transformers/models/glmasr/modeling_glmasr.py +512 -0
  307. transformers/models/glmasr/modular_glmasr.py +433 -0
  308. transformers/models/glmasr/processing_glmasr.py +332 -0
  309. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  310. transformers/models/glpn/modeling_glpn.py +2 -0
  311. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  312. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  313. transformers/models/gpt2/modeling_gpt2.py +13 -6
  314. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  315. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
  316. transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
  317. transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
  318. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  319. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  320. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
  321. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  322. transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
  323. transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
  324. transformers/models/gptj/modeling_gptj.py +18 -6
  325. transformers/models/granite/modeling_granite.py +5 -5
  326. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  327. transformers/models/granitemoe/modeling_granitemoe.py +6 -9
  328. transformers/models/granitemoe/modular_granitemoe.py +1 -4
  329. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  330. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
  331. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  332. transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
  333. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  334. transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
  335. transformers/models/groupvit/modeling_groupvit.py +9 -1
  336. transformers/models/helium/modeling_helium.py +5 -4
  337. transformers/models/herbert/tokenization_herbert.py +9 -25
  338. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
  339. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
  340. transformers/models/hiera/modeling_hiera.py +4 -0
  341. transformers/models/hubert/modeling_hubert.py +7 -0
  342. transformers/models/hubert/modular_hubert.py +5 -0
  343. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
  344. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  345. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  346. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
  347. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  348. transformers/models/ibert/modeling_ibert.py +22 -0
  349. transformers/models/idefics/modeling_idefics.py +15 -21
  350. transformers/models/idefics2/modeling_idefics2.py +7 -1
  351. transformers/models/idefics3/modeling_idefics3.py +5 -1
  352. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  353. transformers/models/imagegpt/modeling_imagegpt.py +11 -3
  354. transformers/models/informer/modeling_informer.py +4 -0
  355. transformers/models/informer/modular_informer.py +1 -0
  356. transformers/models/instructblip/modeling_instructblip.py +2 -0
  357. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  358. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  359. transformers/models/internvl/modeling_internvl.py +13 -12
  360. transformers/models/internvl/modular_internvl.py +7 -13
  361. transformers/models/internvl/video_processing_internvl.py +0 -1
  362. transformers/models/jais2/__init__.py +27 -0
  363. transformers/models/jais2/configuration_jais2.py +152 -0
  364. transformers/models/jais2/modeling_jais2.py +486 -0
  365. transformers/models/jais2/modular_jais2.py +196 -0
  366. transformers/models/jamba/modeling_jamba.py +25 -20
  367. transformers/models/jamba/modular_jamba.py +17 -17
  368. transformers/models/janus/image_processing_janus_fast.py +0 -1
  369. transformers/models/janus/modeling_janus.py +16 -7
  370. transformers/models/janus/modular_janus.py +17 -7
  371. transformers/models/jetmoe/modeling_jetmoe.py +4 -4
  372. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  373. transformers/models/kosmos2/modeling_kosmos2.py +15 -2
  374. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  375. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  376. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
  377. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  378. transformers/models/lasr/__init__.py +29 -0
  379. transformers/models/lasr/configuration_lasr.py +248 -0
  380. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  381. transformers/models/lasr/modeling_lasr.py +730 -0
  382. transformers/models/lasr/modular_lasr.py +576 -0
  383. transformers/models/lasr/processing_lasr.py +94 -0
  384. transformers/models/lasr/tokenization_lasr.py +186 -0
  385. transformers/models/layoutlm/modeling_layoutlm.py +10 -3
  386. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  387. transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
  388. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
  389. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  390. transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
  391. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  392. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  393. transformers/models/led/modeling_led.py +12 -0
  394. transformers/models/levit/modeling_levit.py +21 -0
  395. transformers/models/lfm2/modeling_lfm2.py +5 -6
  396. transformers/models/lfm2/modular_lfm2.py +0 -1
  397. transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
  398. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  399. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  400. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  401. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  402. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  403. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  404. transformers/models/lightglue/modeling_lightglue.py +3 -1
  405. transformers/models/lightglue/modular_lightglue.py +1 -0
  406. transformers/models/lilt/modeling_lilt.py +23 -15
  407. transformers/models/llama/modeling_llama.py +5 -5
  408. transformers/models/llama/tokenization_llama.py +15 -43
  409. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  410. transformers/models/llama4/modeling_llama4.py +11 -6
  411. transformers/models/llava/image_processing_llava_fast.py +0 -1
  412. transformers/models/llava/modeling_llava.py +12 -7
  413. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  414. transformers/models/llava_next/modeling_llava_next.py +7 -3
  415. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  416. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  417. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  418. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  419. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  420. transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
  421. transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
  422. transformers/models/longformer/modeling_longformer.py +6 -0
  423. transformers/models/longt5/modeling_longt5.py +4 -4
  424. transformers/models/luke/modeling_luke.py +9 -0
  425. transformers/models/luke/tokenization_luke.py +11 -38
  426. transformers/models/lxmert/modeling_lxmert.py +2 -0
  427. transformers/models/m2m_100/modeling_m2m_100.py +14 -0
  428. transformers/models/mamba/modeling_mamba.py +16 -23
  429. transformers/models/mamba2/modeling_mamba2.py +24 -23
  430. transformers/models/marian/configuration_marian.py +1 -1
  431. transformers/models/marian/modeling_marian.py +8 -0
  432. transformers/models/markuplm/modeling_markuplm.py +9 -8
  433. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  434. transformers/models/mask2former/configuration_mask2former.py +3 -3
  435. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  436. transformers/models/mask2former/modeling_mask2former.py +11 -0
  437. transformers/models/maskformer/configuration_maskformer.py +3 -3
  438. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  439. transformers/models/maskformer/modeling_maskformer.py +11 -1
  440. transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
  441. transformers/models/mbart/configuration_mbart.py +1 -0
  442. transformers/models/mbart/modeling_mbart.py +14 -0
  443. transformers/models/mbart/tokenization_mbart.py +11 -52
  444. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  445. transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
  446. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  447. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  448. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  449. transformers/models/mimi/modeling_mimi.py +28 -5
  450. transformers/models/minimax/modeling_minimax.py +19 -6
  451. transformers/models/minimax/modular_minimax.py +12 -1
  452. transformers/models/ministral/modeling_ministral.py +5 -5
  453. transformers/models/ministral3/configuration_ministral3.py +1 -1
  454. transformers/models/ministral3/modeling_ministral3.py +5 -4
  455. transformers/models/mistral/modeling_mistral.py +5 -4
  456. transformers/models/mistral3/modeling_mistral3.py +10 -4
  457. transformers/models/mistral3/modular_mistral3.py +3 -1
  458. transformers/models/mixtral/modeling_mixtral.py +15 -7
  459. transformers/models/mixtral/modular_mixtral.py +6 -2
  460. transformers/models/mlcd/modeling_mlcd.py +6 -0
  461. transformers/models/mlcd/modular_mlcd.py +4 -0
  462. transformers/models/mllama/modeling_mllama.py +15 -4
  463. transformers/models/mluke/tokenization_mluke.py +6 -6
  464. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  465. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
  466. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  467. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  468. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  469. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  470. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  471. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  472. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  473. transformers/models/mobilevit/modeling_mobilevit.py +7 -0
  474. transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
  475. transformers/models/modernbert/modeling_modernbert.py +16 -2
  476. transformers/models/modernbert/modular_modernbert.py +14 -1
  477. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
  478. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
  479. transformers/models/moonshine/modeling_moonshine.py +5 -3
  480. transformers/models/moshi/modeling_moshi.py +26 -53
  481. transformers/models/mpnet/modeling_mpnet.py +7 -0
  482. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  483. transformers/models/mpt/modeling_mpt.py +2 -0
  484. transformers/models/mra/modeling_mra.py +10 -1
  485. transformers/models/mt5/configuration_mt5.py +2 -3
  486. transformers/models/mt5/modeling_mt5.py +7 -10
  487. transformers/models/musicgen/modeling_musicgen.py +7 -9
  488. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
  489. transformers/models/mvp/modeling_mvp.py +14 -0
  490. transformers/models/nanochat/modeling_nanochat.py +5 -5
  491. transformers/models/nemotron/modeling_nemotron.py +7 -5
  492. transformers/models/nllb/tokenization_nllb.py +8 -22
  493. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  494. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  495. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  496. transformers/models/nougat/tokenization_nougat.py +15 -68
  497. transformers/models/nystromformer/modeling_nystromformer.py +13 -0
  498. transformers/models/olmo/modeling_olmo.py +5 -5
  499. transformers/models/olmo/modular_olmo.py +2 -2
  500. transformers/models/olmo2/modeling_olmo2.py +5 -6
  501. transformers/models/olmo2/modular_olmo2.py +0 -1
  502. transformers/models/olmo3/modeling_olmo3.py +5 -5
  503. transformers/models/olmoe/modeling_olmoe.py +15 -7
  504. transformers/models/olmoe/modular_olmoe.py +4 -2
  505. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  506. transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
  507. transformers/models/oneformer/configuration_oneformer.py +3 -3
  508. transformers/models/oneformer/modeling_oneformer.py +11 -39
  509. transformers/models/openai/modeling_openai.py +15 -0
  510. transformers/models/openai/tokenization_openai.py +10 -46
  511. transformers/models/opt/modeling_opt.py +2 -0
  512. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  513. transformers/models/ovis2/modeling_ovis2.py +15 -3
  514. transformers/models/ovis2/modular_ovis2.py +8 -0
  515. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  516. transformers/models/owlv2/modeling_owlv2.py +11 -3
  517. transformers/models/owlv2/modular_owlv2.py +0 -2
  518. transformers/models/owlvit/modeling_owlvit.py +11 -3
  519. transformers/models/paddleocr_vl/__init__.py +32 -0
  520. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  521. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
  522. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  523. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
  524. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
  525. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  526. transformers/models/paligemma/modeling_paligemma.py +25 -17
  527. transformers/models/parakeet/configuration_parakeet.py +4 -6
  528. transformers/models/parakeet/modeling_parakeet.py +14 -6
  529. transformers/models/parakeet/modular_parakeet.py +7 -2
  530. transformers/models/parakeet/processing_parakeet.py +1 -0
  531. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  532. transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
  533. transformers/models/patchtst/modeling_patchtst.py +25 -6
  534. transformers/models/pe_audio/__init__.py +30 -0
  535. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  536. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  537. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  538. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  539. transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
  540. transformers/models/pe_audio_video/__init__.py +29 -0
  541. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  542. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  543. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  544. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  545. transformers/models/pe_video/__init__.py +30 -0
  546. transformers/models/pe_video/configuration_pe_video.py +211 -0
  547. transformers/models/pe_video/modeling_pe_video.py +636 -0
  548. transformers/models/pe_video/modular_pe_video.py +219 -0
  549. transformers/models/pe_video/processing_pe_video.py +10 -0
  550. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  551. transformers/models/pegasus/configuration_pegasus.py +1 -0
  552. transformers/models/pegasus/modeling_pegasus.py +8 -0
  553. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  554. transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
  555. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  556. transformers/models/perceiver/modeling_perceiver.py +13 -1
  557. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  558. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  559. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  560. transformers/models/persimmon/modeling_persimmon.py +3 -2
  561. transformers/models/phi/modeling_phi.py +5 -6
  562. transformers/models/phi/modular_phi.py +0 -1
  563. transformers/models/phi3/modeling_phi3.py +3 -2
  564. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
  565. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
  566. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  567. transformers/models/phimoe/modeling_phimoe.py +15 -7
  568. transformers/models/phimoe/modular_phimoe.py +3 -3
  569. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  570. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  571. transformers/models/pixio/__init__.py +30 -0
  572. transformers/models/pixio/configuration_pixio.py +151 -0
  573. transformers/models/pixio/modeling_pixio.py +507 -0
  574. transformers/models/pixio/modular_pixio.py +404 -0
  575. transformers/models/pixtral/modeling_pixtral.py +3 -2
  576. transformers/models/pixtral/processing_pixtral.py +3 -1
  577. transformers/models/plbart/configuration_plbart.py +1 -0
  578. transformers/models/plbart/modeling_plbart.py +13 -0
  579. transformers/models/plbart/modular_plbart.py +8 -0
  580. transformers/models/plbart/tokenization_plbart.py +0 -2
  581. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  582. transformers/models/poolformer/modeling_poolformer.py +13 -1
  583. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  584. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  585. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  586. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  587. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  588. transformers/models/prophetnet/modeling_prophetnet.py +5 -1
  589. transformers/models/pvt/modeling_pvt.py +2 -0
  590. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  591. transformers/models/qwen2/modeling_qwen2.py +5 -5
  592. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  593. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  594. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
  595. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
  596. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  597. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
  598. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
  599. transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
  600. transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
  601. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  602. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  603. transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
  604. transformers/models/qwen3/modeling_qwen3.py +5 -5
  605. transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
  606. transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
  607. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  608. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
  609. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
  610. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  611. transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
  612. transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
  613. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  614. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
  615. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
  616. transformers/models/rag/configuration_rag.py +0 -8
  617. transformers/models/rag/modeling_rag.py +8 -9
  618. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
  619. transformers/models/reformer/modeling_reformer.py +13 -1
  620. transformers/models/reformer/tokenization_reformer.py +11 -28
  621. transformers/models/regnet/modeling_regnet.py +10 -1
  622. transformers/models/rembert/modeling_rembert.py +13 -1
  623. transformers/models/rembert/tokenization_rembert.py +3 -10
  624. transformers/models/resnet/modeling_resnet.py +19 -5
  625. transformers/models/roberta/modeling_roberta.py +3 -0
  626. transformers/models/roberta/modular_roberta.py +3 -0
  627. transformers/models/roberta/tokenization_roberta.py +18 -27
  628. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  629. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  630. transformers/models/roformer/modeling_roformer.py +6 -0
  631. transformers/models/roformer/tokenization_roformer.py +77 -412
  632. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  633. transformers/models/rt_detr/modeling_rt_detr.py +6 -0
  634. transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
  635. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  636. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
  637. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  638. transformers/models/rwkv/modeling_rwkv.py +2 -1
  639. transformers/models/sam/configuration_sam.py +1 -0
  640. transformers/models/sam/image_processing_sam_fast.py +0 -1
  641. transformers/models/sam/modeling_sam.py +4 -1
  642. transformers/models/sam2/configuration_sam2.py +1 -1
  643. transformers/models/sam2/modeling_sam2.py +7 -3
  644. transformers/models/sam2/modular_sam2.py +7 -3
  645. transformers/models/sam2_video/modeling_sam2_video.py +52 -43
  646. transformers/models/sam2_video/modular_sam2_video.py +32 -18
  647. transformers/models/sam3/configuration_sam3.py +21 -1
  648. transformers/models/sam3/modeling_sam3.py +100 -80
  649. transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
  650. transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
  651. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  652. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
  653. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  654. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  655. transformers/models/sam3_video/modeling_sam3_video.py +4 -3
  656. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  657. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  658. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  659. transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
  660. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  661. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
  662. transformers/models/seed_oss/modeling_seed_oss.py +3 -3
  663. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  664. transformers/models/segformer/modeling_segformer.py +6 -3
  665. transformers/models/segformer/modular_segformer.py +0 -1
  666. transformers/models/seggpt/modeling_seggpt.py +2 -0
  667. transformers/models/sew/modeling_sew.py +3 -0
  668. transformers/models/sew/modular_sew.py +1 -0
  669. transformers/models/sew_d/modeling_sew_d.py +3 -0
  670. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  671. transformers/models/siglip/modeling_siglip.py +24 -2
  672. transformers/models/siglip2/modeling_siglip2.py +67 -41
  673. transformers/models/siglip2/modular_siglip2.py +4 -0
  674. transformers/models/smollm3/modeling_smollm3.py +5 -5
  675. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  676. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  677. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  678. transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
  679. transformers/models/speecht5/modeling_speecht5.py +41 -1
  680. transformers/models/splinter/modeling_splinter.py +12 -3
  681. transformers/models/splinter/tokenization_splinter.py +9 -28
  682. transformers/models/squeezebert/modeling_squeezebert.py +8 -0
  683. transformers/models/stablelm/modeling_stablelm.py +4 -2
  684. transformers/models/starcoder2/modeling_starcoder2.py +5 -4
  685. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  686. transformers/models/superglue/modeling_superglue.py +1 -0
  687. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  688. transformers/models/superpoint/modeling_superpoint.py +1 -0
  689. transformers/models/swiftformer/modeling_swiftformer.py +6 -0
  690. transformers/models/swin/modeling_swin.py +20 -12
  691. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  692. transformers/models/swin2sr/modeling_swin2sr.py +51 -33
  693. transformers/models/swinv2/modeling_swinv2.py +45 -33
  694. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  695. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  696. transformers/models/t5/configuration_t5.py +7 -1
  697. transformers/models/t5/modeling_t5.py +8 -7
  698. transformers/models/t5/tokenization_t5.py +4 -8
  699. transformers/models/t5gemma/modeling_t5gemma.py +6 -6
  700. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  701. transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
  702. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  703. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  704. transformers/models/table_transformer/modeling_table_transformer.py +5 -1
  705. transformers/models/tapas/modeling_tapas.py +3 -0
  706. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  707. transformers/models/textnet/modeling_textnet.py +11 -2
  708. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  709. transformers/models/timesfm/modeling_timesfm.py +14 -0
  710. transformers/models/timesfm/modular_timesfm.py +14 -0
  711. transformers/models/timesformer/modeling_timesformer.py +2 -0
  712. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  713. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  714. transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
  715. transformers/models/trocr/modeling_trocr.py +3 -2
  716. transformers/models/tvp/configuration_tvp.py +5 -1
  717. transformers/models/tvp/modeling_tvp.py +6 -4
  718. transformers/models/udop/configuration_udop.py +1 -0
  719. transformers/models/udop/modeling_udop.py +7 -7
  720. transformers/models/udop/tokenization_udop.py +5 -13
  721. transformers/models/umt5/configuration_umt5.py +2 -2
  722. transformers/models/umt5/modeling_umt5.py +7 -6
  723. transformers/models/unispeech/modeling_unispeech.py +4 -0
  724. transformers/models/unispeech/modular_unispeech.py +2 -0
  725. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  726. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  727. transformers/models/univnet/modeling_univnet.py +1 -0
  728. transformers/models/upernet/modeling_upernet.py +1 -0
  729. transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
  730. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  731. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  732. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  733. transformers/models/video_llava/modeling_video_llava.py +7 -3
  734. transformers/models/vilt/configuration_vilt.py +2 -2
  735. transformers/models/vilt/modeling_vilt.py +13 -0
  736. transformers/models/vipllava/modeling_vipllava.py +7 -3
  737. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  738. transformers/models/visual_bert/modeling_visual_bert.py +8 -0
  739. transformers/models/vitdet/modeling_vitdet.py +2 -0
  740. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  741. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  742. transformers/models/vitmatte/modeling_vitmatte.py +5 -0
  743. transformers/models/vitpose/configuration_vitpose.py +1 -1
  744. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  745. transformers/models/vits/modeling_vits.py +1 -0
  746. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  747. transformers/models/voxtral/modeling_voxtral.py +2 -2
  748. transformers/models/voxtral/modular_voxtral.py +2 -2
  749. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  750. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
  751. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
  752. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
  753. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  754. transformers/models/wavlm/modeling_wavlm.py +5 -0
  755. transformers/models/whisper/generation_whisper.py +1 -0
  756. transformers/models/whisper/modeling_whisper.py +11 -3
  757. transformers/models/whisper/tokenization_whisper.py +4 -15
  758. transformers/models/x_clip/modeling_x_clip.py +5 -0
  759. transformers/models/xcodec/modeling_xcodec.py +5 -0
  760. transformers/models/xglm/modeling_xglm.py +11 -0
  761. transformers/models/xglm/tokenization_xglm.py +4 -9
  762. transformers/models/xlm/modeling_xlm.py +18 -14
  763. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  764. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  765. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  766. transformers/models/xlnet/modeling_xlnet.py +3 -1
  767. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  768. transformers/models/xmod/modeling_xmod.py +3 -0
  769. transformers/models/yoso/modeling_yoso.py +10 -1
  770. transformers/models/zamba/modeling_zamba.py +4 -1
  771. transformers/models/zamba2/modeling_zamba2.py +7 -4
  772. transformers/models/zamba2/modular_zamba2.py +1 -1
  773. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  774. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  775. transformers/models/zoedepth/modeling_zoedepth.py +8 -0
  776. transformers/pipelines/__init__.py +11 -9
  777. transformers/pipelines/automatic_speech_recognition.py +20 -12
  778. transformers/pipelines/base.py +2 -10
  779. transformers/pipelines/document_question_answering.py +4 -2
  780. transformers/pipelines/question_answering.py +1 -1
  781. transformers/pipelines/text_generation.py +1 -1
  782. transformers/pipelines/text_to_audio.py +2 -2
  783. transformers/processing_utils.py +133 -50
  784. transformers/quantizers/auto.py +2 -4
  785. transformers/quantizers/base.py +44 -174
  786. transformers/quantizers/quantizer_aqlm.py +2 -23
  787. transformers/quantizers/quantizer_auto_round.py +2 -12
  788. transformers/quantizers/quantizer_awq.py +20 -89
  789. transformers/quantizers/quantizer_bitnet.py +4 -14
  790. transformers/quantizers/quantizer_bnb_4bit.py +18 -155
  791. transformers/quantizers/quantizer_bnb_8bit.py +24 -110
  792. transformers/quantizers/quantizer_compressed_tensors.py +2 -9
  793. transformers/quantizers/quantizer_eetq.py +16 -74
  794. transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
  795. transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
  796. transformers/quantizers/quantizer_fp_quant.py +52 -82
  797. transformers/quantizers/quantizer_gptq.py +8 -28
  798. transformers/quantizers/quantizer_higgs.py +42 -60
  799. transformers/quantizers/quantizer_hqq.py +144 -153
  800. transformers/quantizers/quantizer_mxfp4.py +14 -194
  801. transformers/quantizers/quantizer_quanto.py +35 -79
  802. transformers/quantizers/quantizer_quark.py +36 -17
  803. transformers/quantizers/quantizer_spqr.py +4 -12
  804. transformers/quantizers/quantizer_torchao.py +50 -325
  805. transformers/quantizers/quantizer_vptq.py +4 -27
  806. transformers/quantizers/quantizers_utils.py +20 -0
  807. transformers/testing_utils.py +324 -47
  808. transformers/tokenization_mistral_common.py +7 -2
  809. transformers/tokenization_utils_base.py +116 -224
  810. transformers/tokenization_utils_tokenizers.py +190 -106
  811. transformers/trainer.py +51 -32
  812. transformers/trainer_callback.py +8 -0
  813. transformers/trainer_jit_checkpoint.py +126 -0
  814. transformers/trainer_seq2seq.py +4 -0
  815. transformers/trainer_utils.py +1 -1
  816. transformers/training_args.py +74 -38
  817. transformers/utils/__init__.py +7 -4
  818. transformers/utils/attention_visualizer.py +4 -4
  819. transformers/utils/auto_docstring.py +35 -25
  820. transformers/utils/generic.py +47 -1
  821. transformers/utils/hub.py +5 -15
  822. transformers/utils/import_utils.py +112 -25
  823. transformers/utils/kernel_config.py +74 -19
  824. transformers/utils/loading_report.py +19 -10
  825. transformers/utils/quantization_config.py +78 -245
  826. transformers/video_processing_utils.py +17 -14
  827. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
  828. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
  829. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
  830. transformers/kernels/__init__.py +0 -0
  831. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  832. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  833. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  834. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
  835. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -13,7 +13,7 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
- from typing import Optional
16
+ from typing import Optional, Union
17
17
 
18
18
  from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
19
19
  from tokenizers.models import BPE
@@ -54,19 +54,20 @@ class HerbertTokenizer(TokenizersBackend):
54
54
  The mask token.
55
55
  sep_token (`str`, *optional*, defaults to `"</s>"`):
56
56
  The separator token.
57
- vocab (`dict`, *optional*):
57
+ vocab (`str`, `dict` or `list`, *optional*):
58
58
  Custom vocabulary dictionary.
59
- merges (`list`, *optional*):
59
+ merges (`str` or `list[str]`, *optional*):
60
60
  Custom merges list.
61
61
  """
62
62
 
63
63
  vocab_files_names = VOCAB_FILES_NAMES
64
- slow_tokenizer_class = None
64
+ model_input_names = ["input_ids", "attention_mask"]
65
+ model = BPE
65
66
 
66
67
  def __init__(
67
68
  self,
68
- vocab: Optional[dict] = None,
69
- merges: Optional[list] = None,
69
+ vocab: Optional[Union[str, dict[str, int]]] = None,
70
+ merges: Optional[Union[str, list[str]]] = None,
70
71
  cls_token: str = "<s>",
71
72
  unk_token: str = "<unk>",
72
73
  pad_token: str = "<pad>",
@@ -76,19 +77,8 @@ class HerbertTokenizer(TokenizersBackend):
76
77
  merges_file: Optional[str] = None,
77
78
  **kwargs,
78
79
  ):
79
- if vocab is not None:
80
- self._vocab = (
81
- {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
82
- )
83
- else:
84
- self._vocab = {}
85
-
86
- if merges is not None:
87
- # Convert lists to tuples if necessary (happens when loading from JSON)
88
- self._merges = [tuple(merge) if isinstance(merge, list) else merge for merge in merges]
89
- else:
90
- self._merges = []
91
-
80
+ self._vocab = vocab if vocab is not None else {str(unk_token): 0}
81
+ self._merges = merges or []
92
82
  self._tokenizer = Tokenizer(
93
83
  BPE(
94
84
  vocab=self._vocab,
@@ -105,13 +95,7 @@ class HerbertTokenizer(TokenizersBackend):
105
95
  self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
106
96
  self._tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")
107
97
 
108
- tokenizer_object = self._tokenizer
109
-
110
- self.vocab_file = vocab_file
111
- self.merges_file = merges_file
112
-
113
98
  super().__init__(
114
- tokenizer_object=tokenizer_object,
115
99
  cls_token=cls_token,
116
100
  unk_token=unk_token,
117
101
  pad_token=pad_token,
@@ -26,6 +26,7 @@ import torch
26
26
  import torch.nn.functional as F
27
27
  from torch import Tensor, nn
28
28
 
29
+ from ... import initialization as init
29
30
  from ...activations import ACT2FN
30
31
  from ...modeling_outputs import BackboneOutput, BaseModelOutputWithNoAttention, ImageClassifierOutputWithNoAttention
31
32
  from ...modeling_utils import PreTrainedModel
@@ -45,6 +46,15 @@ class HGNetV2PreTrainedModel(PreTrainedModel):
45
46
  input_modalities = ("image",)
46
47
  _no_split_modules = ["HGNetV2BasicLayer"]
47
48
 
49
+ def _init_weights(self, module):
50
+ super()._init_weights(module)
51
+ # We need to check it like that as d_fine models replace the BatchNorm2d by their own
52
+ if "BatchNorm" in module.__class__.__name__:
53
+ init.ones_(module.weight)
54
+ init.zeros_(module.bias)
55
+ init.zeros_(module.running_mean)
56
+ init.ones_(module.running_var)
57
+
48
58
 
49
59
  class HGNetV2LearnableAffineBlock(nn.Module):
50
60
  def __init__(self, scale_value: float = 1.0, bias_value: float = 0.0):
@@ -347,7 +357,11 @@ class HGNetV2Backbone(HGNetV2PreTrainedModel, BackboneMixin):
347
357
 
348
358
  @auto_docstring
349
359
  def forward(
350
- self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
360
+ self,
361
+ pixel_values: Tensor,
362
+ output_hidden_states: Optional[bool] = None,
363
+ return_dict: Optional[bool] = None,
364
+ **kwargs,
351
365
  ) -> BackboneOutput:
352
366
  r"""
353
367
  Examples:
@@ -426,6 +440,7 @@ class HGNetV2ForImageClassification(HGNetV2PreTrainedModel):
426
440
  labels: Optional[torch.LongTensor] = None,
427
441
  output_hidden_states: Optional[bool] = None,
428
442
  return_dict: Optional[bool] = None,
443
+ **kwargs,
429
444
  ) -> ImageClassifierOutputWithNoAttention:
430
445
  r"""
431
446
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -20,6 +20,7 @@ import torch
20
20
  import torch.nn.functional as F
21
21
  from torch import Tensor, nn
22
22
 
23
+ from ... import initialization as init
23
24
  from ...configuration_utils import PreTrainedConfig
24
25
  from ...modeling_outputs import (
25
26
  BackboneOutput,
@@ -170,6 +171,15 @@ class HGNetV2PreTrainedModel(PreTrainedModel):
170
171
  input_modalities = ("image",)
171
172
  _no_split_modules = ["HGNetV2BasicLayer"]
172
173
 
174
+ def _init_weights(self, module):
175
+ super()._init_weights(module)
176
+ # We need to check it like that as d_fine models replace the BatchNorm2d by their own
177
+ if "BatchNorm" in module.__class__.__name__:
178
+ init.ones_(module.weight)
179
+ init.zeros_(module.bias)
180
+ init.zeros_(module.running_mean)
181
+ init.ones_(module.running_var)
182
+
173
183
 
174
184
  class HGNetV2LearnableAffineBlock(nn.Module):
175
185
  def __init__(self, scale_value: float = 1.0, bias_value: float = 0.0):
@@ -470,7 +480,11 @@ class HGNetV2Backbone(HGNetV2PreTrainedModel, BackboneMixin):
470
480
 
471
481
  @auto_docstring
472
482
  def forward(
473
- self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
483
+ self,
484
+ pixel_values: Tensor,
485
+ output_hidden_states: Optional[bool] = None,
486
+ return_dict: Optional[bool] = None,
487
+ **kwargs,
474
488
  ) -> BackboneOutput:
475
489
  r"""
476
490
  Examples:
@@ -549,6 +563,7 @@ class HGNetV2ForImageClassification(HGNetV2PreTrainedModel):
549
563
  labels: Optional[torch.LongTensor] = None,
550
564
  output_hidden_states: Optional[bool] = None,
551
565
  return_dict: Optional[bool] = None,
566
+ **kwargs,
552
567
  ) -> ImageClassifierOutputWithNoAttention:
553
568
  r"""
554
569
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -848,6 +848,7 @@ class HieraModel(HieraPreTrainedModel):
848
848
  output_hidden_states: Optional[bool] = None,
849
849
  interpolate_pos_encoding: Optional[bool] = None,
850
850
  return_dict: Optional[bool] = None,
851
+ **kwargs,
851
852
  ) -> Union[tuple, BaseModelOutputWithPooling]:
852
853
  r"""
853
854
  noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*):
@@ -1132,6 +1133,7 @@ class HieraForPreTraining(HieraPreTrainedModel):
1132
1133
  output_hidden_states: Optional[bool] = None,
1133
1134
  interpolate_pos_encoding: Optional[bool] = None,
1134
1135
  return_dict: Optional[bool] = None,
1136
+ **kwargs,
1135
1137
  ) -> Union[tuple, HieraForPreTrainingOutput]:
1136
1138
  r"""
1137
1139
  noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*):
@@ -1249,6 +1251,7 @@ class HieraForImageClassification(HieraPreTrainedModel):
1249
1251
  output_hidden_states: Optional[bool] = None,
1250
1252
  interpolate_pos_encoding: Optional[bool] = None,
1251
1253
  return_dict: Optional[bool] = None,
1254
+ **kwargs,
1252
1255
  ) -> Union[tuple, HieraForImageClassificationOutput]:
1253
1256
  r"""
1254
1257
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1325,6 +1328,7 @@ class HieraBackbone(HieraPreTrainedModel, BackboneMixin):
1325
1328
  output_hidden_states: Optional[bool] = None,
1326
1329
  output_attentions: Optional[bool] = None,
1327
1330
  return_dict: Optional[bool] = None,
1331
+ **kwargs,
1328
1332
  ) -> BackboneOutput:
1329
1333
  """
1330
1334
  Returns:
@@ -648,6 +648,10 @@ class HubertPreTrainedModel(PreTrainedModel):
648
648
  elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm1d)):
649
649
  init.zeros_(module.bias)
650
650
  init.ones_(module.weight)
651
+ if getattr(module, "running_mean", None) is not None:
652
+ init.zeros_(module.running_mean)
653
+ init.ones_(module.running_var)
654
+ init.zeros_(module.num_batches_tracked)
651
655
  elif isinstance(module, nn.Conv1d):
652
656
  if is_deepspeed_zero3_enabled():
653
657
  import deepspeed
@@ -892,6 +896,7 @@ class HubertModel(HubertPreTrainedModel):
892
896
  output_attentions: Optional[bool] = None,
893
897
  output_hidden_states: Optional[bool] = None,
894
898
  return_dict: Optional[bool] = None,
899
+ **kwargs,
895
900
  ) -> Union[tuple, BaseModelOutput]:
896
901
  r"""
897
902
  mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1038,6 +1043,7 @@ class HubertForCTC(HubertPreTrainedModel):
1038
1043
  output_hidden_states: Optional[bool] = None,
1039
1044
  return_dict: Optional[bool] = None,
1040
1045
  labels: Optional[torch.Tensor] = None,
1046
+ **kwargs,
1041
1047
  ) -> Union[tuple, CausalLMOutput]:
1042
1048
  r"""
1043
1049
  labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
@@ -1149,6 +1155,7 @@ class HubertForSequenceClassification(HubertPreTrainedModel):
1149
1155
  output_hidden_states: Optional[bool] = None,
1150
1156
  return_dict: Optional[bool] = None,
1151
1157
  labels: Optional[torch.Tensor] = None,
1158
+ **kwargs,
1152
1159
  ) -> Union[tuple, SequenceClassifierOutput]:
1153
1160
  r"""
1154
1161
  input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -145,6 +145,10 @@ class HubertPreTrainedModel(PreTrainedModel):
145
145
  elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm1d)):
146
146
  init.zeros_(module.bias)
147
147
  init.ones_(module.weight)
148
+ if getattr(module, "running_mean", None) is not None:
149
+ init.zeros_(module.running_mean)
150
+ init.ones_(module.running_var)
151
+ init.zeros_(module.num_batches_tracked)
148
152
  elif isinstance(module, nn.Conv1d):
149
153
  if is_deepspeed_zero3_enabled():
150
154
  import deepspeed
@@ -226,6 +230,7 @@ class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):
226
230
  output_attentions: Optional[bool] = None,
227
231
  output_hidden_states: Optional[bool] = None,
228
232
  return_dict: Optional[bool] = None,
233
+ **kwargs,
229
234
  ) -> Union[tuple, BaseModelOutput]:
230
235
  r"""
231
236
  mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -30,7 +30,7 @@ from transformers.cache_utils import Cache
30
30
  from ...activations import ACT2FN
31
31
  from ...cache_utils import DynamicCache
32
32
  from ...generation import GenerationMixin
33
- from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
33
+ from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
34
34
  from ...masking_utils import create_causal_mask
35
35
  from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
36
36
  from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -38,7 +38,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
38
38
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
39
39
  from ...processing_utils import Unpack
40
40
  from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
41
- from ...utils.generic import check_model_inputs
41
+ from ...utils.generic import check_model_inputs, maybe_autocast
42
42
  from .configuration_hunyuan_v1_dense import HunYuanDenseV1Config
43
43
 
44
44
 
@@ -153,6 +153,7 @@ def eager_attention_forward(
153
153
  return attn_output, attn_weights
154
154
 
155
155
 
156
+ @use_kernelized_func(apply_rotary_pos_emb)
156
157
  class HunYuanDenseV1Attention(nn.Module):
157
158
  """Multi-headed attention from 'Attention Is All You Need' paper"""
158
159
 
@@ -178,7 +179,6 @@ class HunYuanDenseV1Attention(nn.Module):
178
179
  self.o_proj = nn.Linear(
179
180
  config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
180
181
  )
181
- self.rotary_fn = apply_rotary_pos_emb
182
182
  self.query_layernorm = HunYuanDenseV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
183
183
  self.key_layernorm = HunYuanDenseV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
184
184
 
@@ -320,7 +320,7 @@ class HunYuanDenseV1RotaryEmbedding(nn.Module):
320
320
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
321
321
 
322
322
  self.register_buffer("inv_freq", inv_freq, persistent=False)
323
- self.original_inv_freq = inv_freq
323
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
324
324
 
325
325
  @staticmethod
326
326
  def compute_default_rope_parameters(
@@ -359,7 +359,7 @@ class HunYuanDenseV1RotaryEmbedding(nn.Module):
359
359
  position_ids_expanded = position_ids[:, None, :].float()
360
360
 
361
361
  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
362
- with torch.autocast(device_type=device_type, enabled=False): # Force float32
362
+ with maybe_autocast(device_type=device_type, enabled=False): # Force float32
363
363
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
364
364
  emb = torch.cat((freqs, freqs), dim=-1)
365
365
  cos = emb.cos() * self.attention_scaling
@@ -148,7 +148,7 @@ class HunYuanDenseV1RotaryEmbedding(LlamaRotaryEmbedding):
148
148
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
149
149
 
150
150
  self.register_buffer("inv_freq", inv_freq, persistent=False)
151
- self.original_inv_freq = inv_freq
151
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
152
152
 
153
153
 
154
154
  class HunYuanDenseV1Model(LlamaModel):
@@ -6,7 +6,7 @@ from ...utils.import_utils import define_import_structure
6
6
 
7
7
  if TYPE_CHECKING:
8
8
  from .configuration_hunyuan_v1_moe import *
9
- from .modeling_hunyuan import *
9
+ from .modeling_hunyuan_v1_moe import *
10
10
  else:
11
11
  import sys
12
12
 
@@ -30,15 +30,20 @@ from ... import initialization as init
30
30
  from ...activations import ACT2FN
31
31
  from ...cache_utils import Cache, DynamicCache
32
32
  from ...generation import GenerationMixin
33
- from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
33
+ from ...integrations import (
34
+ use_experts_implementation,
35
+ use_kernel_forward_from_hub,
36
+ use_kernel_func_from_hub,
37
+ use_kernelized_func,
38
+ )
34
39
  from ...masking_utils import create_causal_mask
35
40
  from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
36
41
  from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
37
42
  from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
38
43
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
39
44
  from ...processing_utils import Unpack
40
- from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
41
- from ...utils.generic import check_model_inputs
45
+ from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_grouped_mm_available
46
+ from ...utils.generic import check_model_inputs, maybe_autocast
42
47
  from .configuration_hunyuan_v1_moe import HunYuanMoEV1Config
43
48
 
44
49
 
@@ -152,6 +157,7 @@ def eager_attention_forward(
152
157
  return attn_output, attn_weights
153
158
 
154
159
 
160
+ @use_kernelized_func(apply_rotary_pos_emb)
155
161
  class HunYuanMoEV1Attention(nn.Module):
156
162
  """Multi-headed attention from 'Attention Is All You Need' paper"""
157
163
 
@@ -177,7 +183,6 @@ class HunYuanMoEV1Attention(nn.Module):
177
183
  self.o_proj = nn.Linear(
178
184
  config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
179
185
  )
180
- self.rotary_fn = apply_rotary_pos_emb
181
186
  self.query_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
182
187
  self.key_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
183
188
 
@@ -244,6 +249,7 @@ class HunYuanMoEV1Gate(nn.Module):
244
249
  return logits
245
250
 
246
251
 
252
+ @use_experts_implementation
247
253
  class HunYuanMoEV1Experts(nn.Module):
248
254
  """Collection of expert weights stored as 3D tensors."""
249
255
 
@@ -371,7 +377,9 @@ class HunYuanMoEV1PreTrainedModel(PreTrainedModel):
371
377
  _supports_flash_attn = True
372
378
  _supports_sdpa = True
373
379
  _supports_flex_attn = True
374
- _can_compile_fullgraph = False
380
+ _can_compile_fullgraph = (
381
+ is_grouped_mm_available()
382
+ ) # https://huggingface.co/docs/transformers/experts_interface#torchcompile
375
383
  _supports_attention_backend = True
376
384
  _can_record_outputs = {
377
385
  "hidden_states": HunYuanMoEV1DecoderLayer,
@@ -413,7 +421,7 @@ class HunYuanMoEV1RotaryEmbedding(nn.Module):
413
421
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
414
422
 
415
423
  self.register_buffer("inv_freq", inv_freq, persistent=False)
416
- self.original_inv_freq = inv_freq
424
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
417
425
 
418
426
  @staticmethod
419
427
  def compute_default_rope_parameters(
@@ -452,7 +460,7 @@ class HunYuanMoEV1RotaryEmbedding(nn.Module):
452
460
  position_ids_expanded = position_ids[:, None, :].float()
453
461
 
454
462
  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
455
- with torch.autocast(device_type=device_type, enabled=False): # Force float32
463
+ with maybe_autocast(device_type=device_type, enabled=False): # Force float32
456
464
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
457
465
  emb = torch.cat((freqs, freqs), dim=-1)
458
466
  cos = emb.cos() * self.attention_scaling
@@ -25,7 +25,7 @@ from ... import initialization as init
25
25
  from ...cache_utils import Cache
26
26
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
27
27
  from ...processing_utils import Unpack
28
- from ...utils import TransformersKwargs, logging
28
+ from ...utils import TransformersKwargs, is_grouped_mm_available, logging
29
29
  from ..hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1RotaryEmbedding
30
30
  from ..llama.modeling_llama import (
31
31
  LlamaAttention,
@@ -177,7 +177,9 @@ class HunYuanMoEV1DecoderLayer(LlamaDecoderLayer):
177
177
 
178
178
 
179
179
  class HunYuanMoEV1PreTrainedModel(LlamaPreTrainedModel):
180
- _can_compile_fullgraph = False
180
+ _can_compile_fullgraph = (
181
+ is_grouped_mm_available()
182
+ ) # https://huggingface.co/docs/transformers/experts_interface#torchcompile
181
183
 
182
184
  @torch.no_grad()
183
185
  def _init_weights(self, module):
@@ -593,16 +593,32 @@ class IBertPreTrainedModel(PreTrainedModel):
593
593
  init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
594
594
  if module.bias is not None:
595
595
  init.zeros_(module.bias)
596
+ if getattr(module, "weight_integer", None) is not None:
597
+ init.zeros_(module.weight_integer)
598
+ init.zeros_(module.fc_scaling_factor)
599
+ if getattr(module, "bias_integer", None) is not None:
600
+ init.zeros_(module.bias_integer)
596
601
  elif isinstance(module, (QuantEmbedding, nn.Embedding)):
597
602
  init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
598
603
  # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
599
604
  if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
600
605
  init.zeros_(module.weight[module.padding_idx])
606
+ if getattr(module, "weight_scaling_factor", None) is not None:
607
+ init.zeros_(module.weight_scaling_factor)
608
+ init.zeros_(module.weight_integer)
601
609
  elif isinstance(module, (IntLayerNorm, nn.LayerNorm)):
602
610
  init.zeros_(module.bias)
603
611
  init.ones_(module.weight)
612
+ if getattr(module, "shift", None) is not None:
613
+ init.zeros_(module.shift)
604
614
  elif isinstance(module, IBertLMHead):
605
615
  init.zeros_(module.bias)
616
+ elif isinstance(module, IBertEmbeddings):
617
+ init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
618
+ elif isinstance(module, QuantAct):
619
+ init.constant_(module.x_min, -1e-5)
620
+ init.constant_(module.x_max, 1e-5)
621
+ init.zeros_(module.act_scaling_factor)
606
622
 
607
623
  def resize_token_embeddings(self, new_num_tokens=None):
608
624
  raise NotImplementedError("`resize_token_embeddings` is not supported for I-BERT.")
@@ -653,6 +669,7 @@ class IBertModel(IBertPreTrainedModel):
653
669
  output_attentions: Optional[bool] = None,
654
670
  output_hidden_states: Optional[bool] = None,
655
671
  return_dict: Optional[bool] = None,
672
+ **kwargs,
656
673
  ) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, tuple[torch.FloatTensor]]:
657
674
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
658
675
  output_hidden_states = (
@@ -746,6 +763,7 @@ class IBertForMaskedLM(IBertPreTrainedModel):
746
763
  output_attentions: Optional[bool] = None,
747
764
  output_hidden_states: Optional[bool] = None,
748
765
  return_dict: Optional[bool] = None,
766
+ **kwargs,
749
767
  ) -> Union[MaskedLMOutput, tuple[torch.FloatTensor]]:
750
768
  r"""
751
769
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -836,6 +854,7 @@ class IBertForSequenceClassification(IBertPreTrainedModel):
836
854
  output_attentions: Optional[bool] = None,
837
855
  output_hidden_states: Optional[bool] = None,
838
856
  return_dict: Optional[bool] = None,
857
+ **kwargs,
839
858
  ) -> Union[SequenceClassifierOutput, tuple[torch.FloatTensor]]:
840
859
  r"""
841
860
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -916,6 +935,7 @@ class IBertForMultipleChoice(IBertPreTrainedModel):
916
935
  output_attentions: Optional[bool] = None,
917
936
  output_hidden_states: Optional[bool] = None,
918
937
  return_dict: Optional[bool] = None,
938
+ **kwargs,
919
939
  ) -> Union[MultipleChoiceModelOutput, tuple[torch.FloatTensor]]:
920
940
  r"""
921
941
  input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -1018,6 +1038,7 @@ class IBertForTokenClassification(IBertPreTrainedModel):
1018
1038
  output_attentions: Optional[bool] = None,
1019
1039
  output_hidden_states: Optional[bool] = None,
1020
1040
  return_dict: Optional[bool] = None,
1041
+ **kwargs,
1021
1042
  ) -> Union[TokenClassifierOutput, tuple[torch.FloatTensor]]:
1022
1043
  r"""
1023
1044
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1102,6 +1123,7 @@ class IBertForQuestionAnswering(IBertPreTrainedModel):
1102
1123
  output_attentions: Optional[bool] = None,
1103
1124
  output_hidden_states: Optional[bool] = None,
1104
1125
  return_dict: Optional[bool] = None,
1126
+ **kwargs,
1105
1127
  ) -> Union[QuestionAnsweringModelOutput, tuple[torch.FloatTensor]]:
1106
1128
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1107
1129
 
@@ -840,6 +840,7 @@ class IdeficsPreTrainedModel(PreTrainedModel):
840
840
  super()._init_weights(module)
841
841
  if isinstance(module, IdeficsVisionEmbeddings):
842
842
  init.normal_(module.class_embedding)
843
+ init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
843
844
  elif isinstance(module, IdeficsGatedCrossAttentionLayer):
844
845
  if self.config.alpha_initializer == "zeros":
845
846
  init.zeros_(module.alpha_cross_attn)
@@ -852,6 +853,15 @@ class IdeficsPreTrainedModel(PreTrainedModel):
852
853
  init.normal_(module.alpha_dense, mean=0.0, std=self.config.alphas_initializer_range)
853
854
  elif isinstance(module, IdeficsPerceiverResampler):
854
855
  init.normal_(module.latents)
856
+ elif isinstance(module, IdeficsEmbedding):
857
+ inv_freq = 1.0 / (module.base ** (torch.arange(0, module.dim, 2) / module.dim))
858
+ init.copy_(module.inv_freq, inv_freq)
859
+ t = torch.arange(module.max_position_embeddings).type_as(inv_freq)
860
+ freqs = torch.einsum("i,j->ij", t, inv_freq)
861
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
862
+ emb = torch.cat((freqs, freqs), dim=-1)
863
+ init.copy_(module.cos_cached, emb.cos())
864
+ init.copy_(module.sin_cached, emb.sin())
855
865
 
856
866
 
857
867
  @auto_docstring
@@ -1107,31 +1117,15 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel, GenerationMixin):
1107
1117
  bias=False,
1108
1118
  partially_freeze=config.freeze_lm_head,
1109
1119
  )
1120
+ if config.additional_vocab_size > 0:
1121
+ self._tied_weights_keys = {
1122
+ "lm_head.weight": "model.embed_tokens.weight",
1123
+ "lm_head.additional_fc.weight": "model.embed_tokens.additional_embedding.weight",
1124
+ }
1110
1125
 
1111
1126
  # Initialize weights and apply final processing
1112
1127
  self.post_init()
1113
1128
 
1114
- def tie_weights(self, **kwargs):
1115
- """
1116
- Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of
1117
- IdeficsDecoupledLinear and IdeficsDecoupledEmbedding.
1118
- """
1119
- output_embeddings = self.get_output_embeddings()
1120
- input_embeddings = self.get_input_embeddings()
1121
-
1122
- if getattr(self.config, "tie_word_embeddings", True):
1123
- output_embeddings.weight = input_embeddings.weight
1124
- if input_embeddings.num_additional_embeddings > 0:
1125
- assert output_embeddings.out_additional_features == input_embeddings.num_additional_embeddings
1126
- output_embeddings.additional_fc.weight = input_embeddings.additional_embedding.weight
1127
-
1128
- if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
1129
- output_embeddings.out_features = input_embeddings.num_embeddings
1130
- if hasattr(output_embeddings, "out_additional_features") and hasattr(
1131
- input_embeddings, "num_additional_embeddings"
1132
- ):
1133
- output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings
1134
-
1135
1129
  @can_return_tuple
1136
1130
  @auto_docstring
1137
1131
  def forward(
@@ -452,6 +452,8 @@ class Idefics2VisionTransformer(Idefics2PreTrainedModel):
452
452
  self.encoder = Idefics2Encoder(config)
453
453
  self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
454
454
 
455
+ self.post_init()
456
+
455
457
  def get_input_embeddings(self):
456
458
  return self.embeddings
457
459
 
@@ -711,6 +713,8 @@ class Idefics2PerceiverResampler(Idefics2PreTrainedModel):
711
713
  self.layers = nn.ModuleList([Idefics2PerceiverLayer(config, idx) for idx in range(self.depth)])
712
714
  self.norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps)
713
715
 
716
+ self.post_init()
717
+
714
718
  @auto_docstring
715
719
  def forward(
716
720
  self,
@@ -1115,6 +1119,7 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin)
1115
1119
  pixel_attention_mask=None,
1116
1120
  image_hidden_states=None,
1117
1121
  logits_to_keep=None,
1122
+ is_first_iteration=False,
1118
1123
  **kwargs,
1119
1124
  ):
1120
1125
  # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take
@@ -1130,10 +1135,11 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin)
1130
1135
  pixel_attention_mask=pixel_attention_mask,
1131
1136
  image_hidden_states=image_hidden_states,
1132
1137
  logits_to_keep=logits_to_keep,
1138
+ is_first_iteration=is_first_iteration,
1133
1139
  **kwargs,
1134
1140
  )
1135
1141
 
1136
- if image_hidden_states is not None or cache_position[0] != 0:
1142
+ if image_hidden_states is not None or not is_first_iteration:
1137
1143
  model_inputs["pixel_values"] = None
1138
1144
  model_inputs["pixel_attention_mask"] = None
1139
1145
 
@@ -458,6 +458,8 @@ class Idefics3VisionTransformer(Idefics3PreTrainedModel):
458
458
  self.patch_size = config.patch_size
459
459
  self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
460
460
 
461
+ self.post_init()
462
+
461
463
  # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2VisionTransformer.get_input_embeddings
462
464
  def get_input_embeddings(self):
463
465
  return self.embeddings
@@ -887,6 +889,7 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin)
887
889
  pixel_attention_mask=None,
888
890
  image_hidden_states=None,
889
891
  logits_to_keep=None,
892
+ is_first_iteration=False,
890
893
  **kwargs,
891
894
  ):
892
895
  # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take
@@ -902,10 +905,11 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin)
902
905
  pixel_attention_mask=pixel_attention_mask,
903
906
  image_hidden_states=image_hidden_states,
904
907
  logits_to_keep=logits_to_keep,
908
+ is_first_iteration=is_first_iteration,
905
909
  **kwargs,
906
910
  )
907
911
 
908
- if image_hidden_states is not None or cache_position[0] != 0:
912
+ if image_hidden_states is not None or not is_first_iteration:
909
913
  model_inputs["pixel_values"] = None
910
914
  model_inputs["pixel_attention_mask"] = None
911
915
 
@@ -164,12 +164,8 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast):
164
164
 
165
165
  input_ids = reorder_images(input_ids_grouped, grouped_images_index)
166
166
 
167
- return BatchFeature(
168
- data={"input_ids": torch.stack(input_ids, dim=0) if return_tensors else input_ids},
169
- tensor_type=return_tensors,
170
- )
167
+ return BatchFeature(data={"input_ids": input_ids}, tensor_type=return_tensors)
171
168
 
172
- pixel_values = torch.stack(pixel_values, dim=0) if return_tensors else pixel_values
173
169
  return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors)
174
170
 
175
171
  def to_dict(self):