transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (835)
  1. transformers/__init__.py +49 -3
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/cli/serve.py +47 -17
  6. transformers/configuration_utils.py +114 -70
  7. transformers/conversion_mapping.py +83 -7
  8. transformers/convert_slow_tokenizer.py +225 -10
  9. transformers/core_model_loading.py +374 -147
  10. transformers/data/data_collator.py +12 -4
  11. transformers/dependency_versions_table.py +2 -3
  12. transformers/dynamic_module_utils.py +1 -2
  13. transformers/feature_extraction_utils.py +55 -24
  14. transformers/file_utils.py +0 -1
  15. transformers/generation/__init__.py +11 -1
  16. transformers/generation/candidate_generator.py +79 -31
  17. transformers/generation/configuration_utils.py +165 -124
  18. transformers/generation/continuous_batching/__init__.py +4 -0
  19. transformers/generation/continuous_batching/cache.py +47 -18
  20. transformers/generation/continuous_batching/cache_manager.py +131 -34
  21. transformers/generation/continuous_batching/continuous_api.py +228 -136
  22. transformers/generation/continuous_batching/requests.py +28 -1
  23. transformers/generation/continuous_batching/scheduler.py +11 -4
  24. transformers/generation/stopping_criteria.py +1 -1
  25. transformers/generation/utils.py +108 -110
  26. transformers/generation/watermarking.py +8 -5
  27. transformers/image_processing_base.py +3 -14
  28. transformers/image_processing_utils_fast.py +15 -4
  29. transformers/initialization.py +37 -0
  30. transformers/integrations/__init__.py +16 -2
  31. transformers/integrations/accelerate.py +58 -113
  32. transformers/integrations/aqlm.py +36 -66
  33. transformers/integrations/awq.py +46 -515
  34. transformers/integrations/bitnet.py +47 -105
  35. transformers/integrations/bitsandbytes.py +91 -202
  36. transformers/integrations/deepspeed.py +18 -2
  37. transformers/integrations/eetq.py +84 -81
  38. transformers/integrations/fbgemm_fp8.py +191 -145
  39. transformers/integrations/finegrained_fp8.py +241 -208
  40. transformers/integrations/flash_attention.py +2 -2
  41. transformers/integrations/fp_quant.py +92 -0
  42. transformers/integrations/ggml.py +11 -1
  43. transformers/integrations/higgs.py +37 -62
  44. transformers/integrations/hub_kernels.py +65 -8
  45. transformers/integrations/integration_utils.py +45 -0
  46. transformers/integrations/mistral.py +12 -0
  47. transformers/integrations/moe.py +240 -0
  48. transformers/integrations/mxfp4.py +28 -74
  49. transformers/integrations/peft.py +12 -29
  50. transformers/integrations/quanto.py +77 -56
  51. transformers/integrations/quark.py +55 -0
  52. transformers/integrations/spqr.py +42 -90
  53. transformers/integrations/tensor_parallel.py +167 -221
  54. transformers/integrations/torchao.py +32 -38
  55. transformers/integrations/vptq.py +40 -59
  56. transformers/modelcard.py +1 -2
  57. transformers/modeling_gguf_pytorch_utils.py +74 -19
  58. transformers/modeling_rope_utils.py +107 -86
  59. transformers/modeling_utils.py +611 -527
  60. transformers/models/__init__.py +22 -0
  61. transformers/models/afmoe/modeling_afmoe.py +10 -19
  62. transformers/models/afmoe/modular_afmoe.py +5 -13
  63. transformers/models/aimv2/modeling_aimv2.py +4 -0
  64. transformers/models/aimv2/modular_aimv2.py +4 -0
  65. transformers/models/albert/modeling_albert.py +3 -0
  66. transformers/models/albert/tokenization_albert.py +6 -12
  67. transformers/models/align/modeling_align.py +14 -6
  68. transformers/models/altclip/modeling_altclip.py +11 -3
  69. transformers/models/apertus/modeling_apertus.py +8 -6
  70. transformers/models/apertus/modular_apertus.py +4 -1
  71. transformers/models/arcee/modeling_arcee.py +5 -5
  72. transformers/models/aria/modeling_aria.py +12 -8
  73. transformers/models/aria/modular_aria.py +7 -3
  74. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  75. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  76. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  77. transformers/models/auto/auto_factory.py +1 -1
  78. transformers/models/auto/configuration_auto.py +38 -0
  79. transformers/models/auto/feature_extraction_auto.py +9 -3
  80. transformers/models/auto/image_processing_auto.py +5 -2
  81. transformers/models/auto/modeling_auto.py +37 -0
  82. transformers/models/auto/processing_auto.py +22 -10
  83. transformers/models/auto/tokenization_auto.py +147 -566
  84. transformers/models/auto/video_processing_auto.py +5 -2
  85. transformers/models/autoformer/modeling_autoformer.py +4 -0
  86. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  87. transformers/models/bamba/modeling_bamba.py +21 -21
  88. transformers/models/bamba/modular_bamba.py +17 -16
  89. transformers/models/bark/modeling_bark.py +11 -0
  90. transformers/models/bart/configuration_bart.py +0 -1
  91. transformers/models/bart/modeling_bart.py +14 -0
  92. transformers/models/barthez/tokenization_barthez.py +5 -10
  93. transformers/models/beit/image_processing_beit_fast.py +0 -1
  94. transformers/models/beit/modeling_beit.py +6 -1
  95. transformers/models/bert/modeling_bert.py +3 -0
  96. transformers/models/bert/tokenization_bert.py +8 -21
  97. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  98. transformers/models/big_bird/modeling_big_bird.py +9 -0
  99. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  100. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
  101. transformers/models/biogpt/modeling_biogpt.py +2 -0
  102. transformers/models/biogpt/modular_biogpt.py +2 -0
  103. transformers/models/bit/modeling_bit.py +16 -3
  104. transformers/models/bitnet/modeling_bitnet.py +5 -5
  105. transformers/models/blenderbot/modeling_blenderbot.py +12 -0
  106. transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
  107. transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
  108. transformers/models/blip/modeling_blip.py +2 -0
  109. transformers/models/blip/modeling_blip_text.py +10 -0
  110. transformers/models/blip_2/modeling_blip_2.py +4 -1
  111. transformers/models/bloom/modeling_bloom.py +17 -44
  112. transformers/models/blt/modeling_blt.py +164 -4
  113. transformers/models/blt/modular_blt.py +170 -5
  114. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  115. transformers/models/bridgetower/modeling_bridgetower.py +11 -1
  116. transformers/models/bros/modeling_bros.py +12 -0
  117. transformers/models/camembert/modeling_camembert.py +109 -106
  118. transformers/models/camembert/tokenization_camembert.py +8 -12
  119. transformers/models/canine/modeling_canine.py +11 -0
  120. transformers/models/canine/tokenization_canine.py +2 -0
  121. transformers/models/chameleon/modeling_chameleon.py +11 -5
  122. transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
  123. transformers/models/clap/feature_extraction_clap.py +2 -2
  124. transformers/models/clap/modeling_clap.py +30 -15
  125. transformers/models/clip/modeling_clip.py +2 -0
  126. transformers/models/clip/tokenization_clip.py +22 -44
  127. transformers/models/clipseg/modeling_clipseg.py +9 -0
  128. transformers/models/clvp/modeling_clvp.py +19 -3
  129. transformers/models/clvp/tokenization_clvp.py +1 -63
  130. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  131. transformers/models/codegen/modeling_codegen.py +13 -4
  132. transformers/models/codegen/tokenization_codegen.py +14 -43
  133. transformers/models/cohere/modeling_cohere.py +5 -4
  134. transformers/models/cohere/modular_cohere.py +2 -1
  135. transformers/models/cohere/tokenization_cohere.py +12 -42
  136. transformers/models/cohere2/modeling_cohere2.py +8 -7
  137. transformers/models/cohere2/modular_cohere2.py +5 -5
  138. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
  139. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  140. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  141. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  142. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  143. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  144. transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
  145. transformers/models/convbert/modeling_convbert.py +9 -0
  146. transformers/models/convnext/image_processing_convnext.py +2 -2
  147. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  148. transformers/models/convnext/modeling_convnext.py +2 -4
  149. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  150. transformers/models/csm/generation_csm.py +19 -22
  151. transformers/models/csm/modeling_csm.py +7 -4
  152. transformers/models/csm/modular_csm.py +2 -0
  153. transformers/models/ctrl/modeling_ctrl.py +15 -2
  154. transformers/models/cvt/modeling_cvt.py +7 -1
  155. transformers/models/cwm/modeling_cwm.py +5 -5
  156. transformers/models/d_fine/configuration_d_fine.py +3 -4
  157. transformers/models/d_fine/modeling_d_fine.py +48 -39
  158. transformers/models/d_fine/modular_d_fine.py +16 -4
  159. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  160. transformers/models/dab_detr/modeling_dab_detr.py +5 -1
  161. transformers/models/dac/modeling_dac.py +6 -6
  162. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  163. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  164. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  165. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  166. transformers/models/dbrx/configuration_dbrx.py +9 -1
  167. transformers/models/dbrx/modeling_dbrx.py +3 -3
  168. transformers/models/deberta/modeling_deberta.py +7 -0
  169. transformers/models/deberta/tokenization_deberta.py +11 -20
  170. transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
  171. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  172. transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
  173. transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
  174. transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
  175. transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
  176. transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
  177. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  178. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  179. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  180. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  181. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  182. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  183. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  184. transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
  185. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  186. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  187. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  188. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  189. transformers/models/detr/configuration_detr.py +1 -1
  190. transformers/models/detr/modeling_detr.py +13 -1
  191. transformers/models/dia/generation_dia.py +3 -10
  192. transformers/models/dia/modeling_dia.py +16 -4
  193. transformers/models/dia/modular_dia.py +11 -1
  194. transformers/models/dia/processing_dia.py +1 -1
  195. transformers/models/diffllama/modeling_diffllama.py +5 -5
  196. transformers/models/diffllama/modular_diffllama.py +2 -2
  197. transformers/models/dinat/modeling_dinat.py +3 -0
  198. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  199. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  200. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
  201. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
  202. transformers/models/distilbert/modeling_distilbert.py +11 -9
  203. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  204. transformers/models/doge/modeling_doge.py +3 -4
  205. transformers/models/doge/modular_doge.py +0 -1
  206. transformers/models/donut/image_processing_donut_fast.py +0 -1
  207. transformers/models/donut/modeling_donut_swin.py +18 -12
  208. transformers/models/dots1/modeling_dots1.py +23 -11
  209. transformers/models/dots1/modular_dots1.py +5 -3
  210. transformers/models/dpr/modeling_dpr.py +5 -0
  211. transformers/models/dpr/tokenization_dpr.py +12 -0
  212. transformers/models/dpt/configuration_dpt.py +1 -1
  213. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  214. transformers/models/dpt/modular_dpt.py +1 -2
  215. transformers/models/edgetam/configuration_edgetam.py +1 -1
  216. transformers/models/edgetam/modeling_edgetam.py +6 -3
  217. transformers/models/edgetam/modular_edgetam.py +15 -14
  218. transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
  219. transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
  220. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  221. transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
  222. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  223. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  224. transformers/models/efficientnet/modeling_efficientnet.py +7 -1
  225. transformers/models/electra/modeling_electra.py +7 -0
  226. transformers/models/emu3/modeling_emu3.py +12 -6
  227. transformers/models/emu3/modular_emu3.py +7 -1
  228. transformers/models/encodec/modeling_encodec.py +14 -0
  229. transformers/models/eomt/image_processing_eomt.py +13 -1
  230. transformers/models/eomt/image_processing_eomt_fast.py +60 -16
  231. transformers/models/eomt/modeling_eomt.py +7 -0
  232. transformers/models/eomt/modular_eomt.py +7 -0
  233. transformers/models/ernie/modeling_ernie.py +6 -0
  234. transformers/models/ernie/modular_ernie.py +6 -0
  235. transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
  236. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  237. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
  238. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
  239. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  240. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  241. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  242. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  243. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  244. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  245. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  246. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  247. transformers/models/esm/modeling_esm.py +6 -0
  248. transformers/models/esm/modeling_esmfold.py +11 -5
  249. transformers/models/evolla/modeling_evolla.py +13 -5
  250. transformers/models/evolla/modular_evolla.py +8 -0
  251. transformers/models/exaone4/modeling_exaone4.py +3 -3
  252. transformers/models/exaone4/modular_exaone4.py +0 -1
  253. transformers/models/falcon/modeling_falcon.py +9 -4
  254. transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
  255. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  256. transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
  257. transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
  258. transformers/models/fast_vlm/__init__.py +27 -0
  259. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  260. transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
  261. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  262. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
  263. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  264. transformers/models/flaubert/modeling_flaubert.py +21 -15
  265. transformers/models/flava/image_processing_flava_fast.py +0 -2
  266. transformers/models/flava/modeling_flava.py +10 -2
  267. transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
  268. transformers/models/florence2/modeling_florence2.py +22 -4
  269. transformers/models/florence2/modular_florence2.py +15 -1
  270. transformers/models/fnet/modeling_fnet.py +14 -0
  271. transformers/models/focalnet/modeling_focalnet.py +4 -0
  272. transformers/models/fsmt/modeling_fsmt.py +2 -0
  273. transformers/models/funnel/modeling_funnel.py +8 -0
  274. transformers/models/funnel/tokenization_funnel.py +17 -24
  275. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  276. transformers/models/fuyu/modeling_fuyu.py +3 -1
  277. transformers/models/fuyu/processing_fuyu.py +19 -3
  278. transformers/models/gemma/modeling_gemma.py +14 -16
  279. transformers/models/gemma/modular_gemma.py +9 -11
  280. transformers/models/gemma/tokenization_gemma.py +10 -27
  281. transformers/models/gemma2/modeling_gemma2.py +5 -5
  282. transformers/models/gemma2/modular_gemma2.py +3 -2
  283. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  284. transformers/models/gemma3/modeling_gemma3.py +42 -91
  285. transformers/models/gemma3/modular_gemma3.py +38 -87
  286. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  287. transformers/models/gemma3n/modeling_gemma3n.py +65 -218
  288. transformers/models/gemma3n/modular_gemma3n.py +68 -68
  289. transformers/models/git/modeling_git.py +183 -126
  290. transformers/models/glm/modeling_glm.py +5 -5
  291. transformers/models/glm4/modeling_glm4.py +5 -5
  292. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  293. transformers/models/glm46v/modeling_glm46v.py +3 -1
  294. transformers/models/glm46v/modular_glm46v.py +3 -0
  295. transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
  296. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  297. transformers/models/glm4v/configuration_glm4v.py +3 -1
  298. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  299. transformers/models/glm4v/modeling_glm4v.py +18 -8
  300. transformers/models/glm4v/modular_glm4v.py +17 -7
  301. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  302. transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
  303. transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
  304. transformers/models/glmasr/__init__.py +30 -0
  305. transformers/models/glmasr/configuration_glmasr.py +197 -0
  306. transformers/models/glmasr/modeling_glmasr.py +512 -0
  307. transformers/models/glmasr/modular_glmasr.py +433 -0
  308. transformers/models/glmasr/processing_glmasr.py +332 -0
  309. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  310. transformers/models/glpn/modeling_glpn.py +2 -0
  311. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  312. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  313. transformers/models/gpt2/modeling_gpt2.py +13 -6
  314. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  315. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
  316. transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
  317. transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
  318. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  319. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  320. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
  321. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  322. transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
  323. transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
  324. transformers/models/gptj/modeling_gptj.py +18 -6
  325. transformers/models/granite/modeling_granite.py +5 -5
  326. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  327. transformers/models/granitemoe/modeling_granitemoe.py +6 -9
  328. transformers/models/granitemoe/modular_granitemoe.py +1 -4
  329. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  330. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
  331. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  332. transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
  333. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  334. transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
  335. transformers/models/groupvit/modeling_groupvit.py +9 -1
  336. transformers/models/helium/modeling_helium.py +5 -4
  337. transformers/models/herbert/tokenization_herbert.py +9 -25
  338. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
  339. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
  340. transformers/models/hiera/modeling_hiera.py +4 -0
  341. transformers/models/hubert/modeling_hubert.py +7 -0
  342. transformers/models/hubert/modular_hubert.py +5 -0
  343. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
  344. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  345. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  346. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
  347. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  348. transformers/models/ibert/modeling_ibert.py +22 -0
  349. transformers/models/idefics/modeling_idefics.py +15 -21
  350. transformers/models/idefics2/modeling_idefics2.py +7 -1
  351. transformers/models/idefics3/modeling_idefics3.py +5 -1
  352. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  353. transformers/models/imagegpt/modeling_imagegpt.py +11 -3
  354. transformers/models/informer/modeling_informer.py +4 -0
  355. transformers/models/informer/modular_informer.py +1 -0
  356. transformers/models/instructblip/modeling_instructblip.py +2 -0
  357. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  358. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  359. transformers/models/internvl/modeling_internvl.py +13 -12
  360. transformers/models/internvl/modular_internvl.py +7 -13
  361. transformers/models/internvl/video_processing_internvl.py +0 -1
  362. transformers/models/jais2/__init__.py +27 -0
  363. transformers/models/jais2/configuration_jais2.py +152 -0
  364. transformers/models/jais2/modeling_jais2.py +486 -0
  365. transformers/models/jais2/modular_jais2.py +196 -0
  366. transformers/models/jamba/modeling_jamba.py +25 -20
  367. transformers/models/jamba/modular_jamba.py +17 -17
  368. transformers/models/janus/image_processing_janus_fast.py +0 -1
  369. transformers/models/janus/modeling_janus.py +16 -7
  370. transformers/models/janus/modular_janus.py +17 -7
  371. transformers/models/jetmoe/modeling_jetmoe.py +4 -4
  372. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  373. transformers/models/kosmos2/modeling_kosmos2.py +15 -2
  374. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  375. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  376. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
  377. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  378. transformers/models/lasr/__init__.py +29 -0
  379. transformers/models/lasr/configuration_lasr.py +248 -0
  380. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  381. transformers/models/lasr/modeling_lasr.py +730 -0
  382. transformers/models/lasr/modular_lasr.py +576 -0
  383. transformers/models/lasr/processing_lasr.py +94 -0
  384. transformers/models/lasr/tokenization_lasr.py +186 -0
  385. transformers/models/layoutlm/modeling_layoutlm.py +10 -3
  386. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  387. transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
  388. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
  389. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  390. transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
  391. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  392. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  393. transformers/models/led/modeling_led.py +12 -0
  394. transformers/models/levit/modeling_levit.py +21 -0
  395. transformers/models/lfm2/modeling_lfm2.py +5 -6
  396. transformers/models/lfm2/modular_lfm2.py +0 -1
  397. transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
  398. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  399. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  400. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  401. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  402. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  403. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  404. transformers/models/lightglue/modeling_lightglue.py +3 -1
  405. transformers/models/lightglue/modular_lightglue.py +1 -0
  406. transformers/models/lilt/modeling_lilt.py +23 -15
  407. transformers/models/llama/modeling_llama.py +5 -5
  408. transformers/models/llama/tokenization_llama.py +15 -43
  409. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  410. transformers/models/llama4/modeling_llama4.py +11 -6
  411. transformers/models/llava/image_processing_llava_fast.py +0 -1
  412. transformers/models/llava/modeling_llava.py +12 -7
  413. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  414. transformers/models/llava_next/modeling_llava_next.py +7 -3
  415. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  416. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  417. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  418. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  419. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  420. transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
  421. transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
  422. transformers/models/longformer/modeling_longformer.py +6 -0
  423. transformers/models/longt5/modeling_longt5.py +4 -4
  424. transformers/models/luke/modeling_luke.py +9 -0
  425. transformers/models/luke/tokenization_luke.py +11 -38
  426. transformers/models/lxmert/modeling_lxmert.py +2 -0
  427. transformers/models/m2m_100/modeling_m2m_100.py +14 -0
  428. transformers/models/mamba/modeling_mamba.py +16 -23
  429. transformers/models/mamba2/modeling_mamba2.py +24 -23
  430. transformers/models/marian/configuration_marian.py +1 -1
  431. transformers/models/marian/modeling_marian.py +8 -0
  432. transformers/models/markuplm/modeling_markuplm.py +9 -8
  433. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  434. transformers/models/mask2former/configuration_mask2former.py +3 -3
  435. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  436. transformers/models/mask2former/modeling_mask2former.py +11 -0
  437. transformers/models/maskformer/configuration_maskformer.py +3 -3
  438. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  439. transformers/models/maskformer/modeling_maskformer.py +11 -1
  440. transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
  441. transformers/models/mbart/configuration_mbart.py +1 -0
  442. transformers/models/mbart/modeling_mbart.py +14 -0
  443. transformers/models/mbart/tokenization_mbart.py +11 -52
  444. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  445. transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
  446. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  447. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  448. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  449. transformers/models/mimi/modeling_mimi.py +28 -5
  450. transformers/models/minimax/modeling_minimax.py +19 -6
  451. transformers/models/minimax/modular_minimax.py +12 -1
  452. transformers/models/ministral/modeling_ministral.py +5 -5
  453. transformers/models/ministral3/configuration_ministral3.py +1 -1
  454. transformers/models/ministral3/modeling_ministral3.py +5 -4
  455. transformers/models/mistral/modeling_mistral.py +5 -4
  456. transformers/models/mistral3/modeling_mistral3.py +10 -4
  457. transformers/models/mistral3/modular_mistral3.py +3 -1
  458. transformers/models/mixtral/modeling_mixtral.py +15 -7
  459. transformers/models/mixtral/modular_mixtral.py +6 -2
  460. transformers/models/mlcd/modeling_mlcd.py +6 -0
  461. transformers/models/mlcd/modular_mlcd.py +4 -0
  462. transformers/models/mllama/modeling_mllama.py +15 -4
  463. transformers/models/mluke/tokenization_mluke.py +6 -6
  464. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  465. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
  466. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  467. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  468. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  469. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  470. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  471. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  472. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  473. transformers/models/mobilevit/modeling_mobilevit.py +7 -0
  474. transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
  475. transformers/models/modernbert/modeling_modernbert.py +16 -2
  476. transformers/models/modernbert/modular_modernbert.py +14 -1
  477. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
  478. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
  479. transformers/models/moonshine/modeling_moonshine.py +5 -3
  480. transformers/models/moshi/modeling_moshi.py +26 -53
  481. transformers/models/mpnet/modeling_mpnet.py +7 -0
  482. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  483. transformers/models/mpt/modeling_mpt.py +2 -0
  484. transformers/models/mra/modeling_mra.py +10 -1
  485. transformers/models/mt5/configuration_mt5.py +2 -3
  486. transformers/models/mt5/modeling_mt5.py +7 -10
  487. transformers/models/musicgen/modeling_musicgen.py +7 -9
  488. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
  489. transformers/models/mvp/modeling_mvp.py +14 -0
  490. transformers/models/nanochat/modeling_nanochat.py +5 -5
  491. transformers/models/nemotron/modeling_nemotron.py +7 -5
  492. transformers/models/nllb/tokenization_nllb.py +8 -22
  493. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  494. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  495. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  496. transformers/models/nougat/tokenization_nougat.py +15 -68
  497. transformers/models/nystromformer/modeling_nystromformer.py +13 -0
  498. transformers/models/olmo/modeling_olmo.py +5 -5
  499. transformers/models/olmo/modular_olmo.py +2 -2
  500. transformers/models/olmo2/modeling_olmo2.py +5 -6
  501. transformers/models/olmo2/modular_olmo2.py +0 -1
  502. transformers/models/olmo3/modeling_olmo3.py +5 -5
  503. transformers/models/olmoe/modeling_olmoe.py +15 -7
  504. transformers/models/olmoe/modular_olmoe.py +4 -2
  505. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  506. transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
  507. transformers/models/oneformer/configuration_oneformer.py +3 -3
  508. transformers/models/oneformer/modeling_oneformer.py +11 -39
  509. transformers/models/openai/modeling_openai.py +15 -0
  510. transformers/models/openai/tokenization_openai.py +10 -46
  511. transformers/models/opt/modeling_opt.py +2 -0
  512. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  513. transformers/models/ovis2/modeling_ovis2.py +15 -3
  514. transformers/models/ovis2/modular_ovis2.py +8 -0
  515. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  516. transformers/models/owlv2/modeling_owlv2.py +11 -3
  517. transformers/models/owlv2/modular_owlv2.py +0 -2
  518. transformers/models/owlvit/modeling_owlvit.py +11 -3
  519. transformers/models/paddleocr_vl/__init__.py +32 -0
  520. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  521. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
  522. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  523. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
  524. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
  525. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  526. transformers/models/paligemma/modeling_paligemma.py +25 -17
  527. transformers/models/parakeet/configuration_parakeet.py +4 -6
  528. transformers/models/parakeet/modeling_parakeet.py +14 -6
  529. transformers/models/parakeet/modular_parakeet.py +7 -2
  530. transformers/models/parakeet/processing_parakeet.py +1 -0
  531. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  532. transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
  533. transformers/models/patchtst/modeling_patchtst.py +25 -6
  534. transformers/models/pe_audio/__init__.py +30 -0
  535. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  536. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  537. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  538. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  539. transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
  540. transformers/models/pe_audio_video/__init__.py +29 -0
  541. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  542. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  543. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  544. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  545. transformers/models/pe_video/__init__.py +30 -0
  546. transformers/models/pe_video/configuration_pe_video.py +211 -0
  547. transformers/models/pe_video/modeling_pe_video.py +636 -0
  548. transformers/models/pe_video/modular_pe_video.py +219 -0
  549. transformers/models/pe_video/processing_pe_video.py +10 -0
  550. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  551. transformers/models/pegasus/configuration_pegasus.py +1 -0
  552. transformers/models/pegasus/modeling_pegasus.py +8 -0
  553. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  554. transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
  555. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  556. transformers/models/perceiver/modeling_perceiver.py +13 -1
  557. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  558. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  559. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  560. transformers/models/persimmon/modeling_persimmon.py +3 -2
  561. transformers/models/phi/modeling_phi.py +5 -6
  562. transformers/models/phi/modular_phi.py +0 -1
  563. transformers/models/phi3/modeling_phi3.py +3 -2
  564. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
  565. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
  566. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  567. transformers/models/phimoe/modeling_phimoe.py +15 -7
  568. transformers/models/phimoe/modular_phimoe.py +3 -3
  569. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  570. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  571. transformers/models/pixio/__init__.py +30 -0
  572. transformers/models/pixio/configuration_pixio.py +151 -0
  573. transformers/models/pixio/modeling_pixio.py +507 -0
  574. transformers/models/pixio/modular_pixio.py +404 -0
  575. transformers/models/pixtral/modeling_pixtral.py +3 -2
  576. transformers/models/pixtral/processing_pixtral.py +3 -1
  577. transformers/models/plbart/configuration_plbart.py +1 -0
  578. transformers/models/plbart/modeling_plbart.py +13 -0
  579. transformers/models/plbart/modular_plbart.py +8 -0
  580. transformers/models/plbart/tokenization_plbart.py +0 -2
  581. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  582. transformers/models/poolformer/modeling_poolformer.py +13 -1
  583. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  584. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  585. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  586. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  587. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  588. transformers/models/prophetnet/modeling_prophetnet.py +5 -1
  589. transformers/models/pvt/modeling_pvt.py +2 -0
  590. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  591. transformers/models/qwen2/modeling_qwen2.py +5 -5
  592. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  593. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  594. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
  595. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
  596. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  597. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
  598. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
  599. transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
  600. transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
  601. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  602. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  603. transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
  604. transformers/models/qwen3/modeling_qwen3.py +5 -5
  605. transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
  606. transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
  607. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  608. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
  609. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
  610. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  611. transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
  612. transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
  613. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  614. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
  615. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
  616. transformers/models/rag/configuration_rag.py +0 -8
  617. transformers/models/rag/modeling_rag.py +8 -9
  618. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
  619. transformers/models/reformer/modeling_reformer.py +13 -1
  620. transformers/models/reformer/tokenization_reformer.py +11 -28
  621. transformers/models/regnet/modeling_regnet.py +10 -1
  622. transformers/models/rembert/modeling_rembert.py +13 -1
  623. transformers/models/rembert/tokenization_rembert.py +3 -10
  624. transformers/models/resnet/modeling_resnet.py +19 -5
  625. transformers/models/roberta/modeling_roberta.py +3 -0
  626. transformers/models/roberta/modular_roberta.py +3 -0
  627. transformers/models/roberta/tokenization_roberta.py +18 -27
  628. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  629. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  630. transformers/models/roformer/modeling_roformer.py +6 -0
  631. transformers/models/roformer/tokenization_roformer.py +77 -412
  632. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  633. transformers/models/rt_detr/modeling_rt_detr.py +6 -0
  634. transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
  635. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  636. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
  637. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  638. transformers/models/rwkv/modeling_rwkv.py +2 -1
  639. transformers/models/sam/configuration_sam.py +1 -0
  640. transformers/models/sam/image_processing_sam_fast.py +0 -1
  641. transformers/models/sam/modeling_sam.py +4 -1
  642. transformers/models/sam2/configuration_sam2.py +1 -1
  643. transformers/models/sam2/modeling_sam2.py +7 -3
  644. transformers/models/sam2/modular_sam2.py +7 -3
  645. transformers/models/sam2_video/modeling_sam2_video.py +52 -43
  646. transformers/models/sam2_video/modular_sam2_video.py +32 -18
  647. transformers/models/sam3/configuration_sam3.py +21 -1
  648. transformers/models/sam3/modeling_sam3.py +100 -80
  649. transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
  650. transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
  651. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  652. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
  653. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  654. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  655. transformers/models/sam3_video/modeling_sam3_video.py +4 -3
  656. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  657. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  658. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  659. transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
  660. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  661. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
  662. transformers/models/seed_oss/modeling_seed_oss.py +3 -3
  663. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  664. transformers/models/segformer/modeling_segformer.py +6 -3
  665. transformers/models/segformer/modular_segformer.py +0 -1
  666. transformers/models/seggpt/modeling_seggpt.py +2 -0
  667. transformers/models/sew/modeling_sew.py +3 -0
  668. transformers/models/sew/modular_sew.py +1 -0
  669. transformers/models/sew_d/modeling_sew_d.py +3 -0
  670. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  671. transformers/models/siglip/modeling_siglip.py +24 -2
  672. transformers/models/siglip2/modeling_siglip2.py +67 -41
  673. transformers/models/siglip2/modular_siglip2.py +4 -0
  674. transformers/models/smollm3/modeling_smollm3.py +5 -5
  675. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  676. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  677. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  678. transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
  679. transformers/models/speecht5/modeling_speecht5.py +41 -1
  680. transformers/models/splinter/modeling_splinter.py +12 -3
  681. transformers/models/splinter/tokenization_splinter.py +9 -28
  682. transformers/models/squeezebert/modeling_squeezebert.py +8 -0
  683. transformers/models/stablelm/modeling_stablelm.py +4 -2
  684. transformers/models/starcoder2/modeling_starcoder2.py +5 -4
  685. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  686. transformers/models/superglue/modeling_superglue.py +1 -0
  687. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  688. transformers/models/superpoint/modeling_superpoint.py +1 -0
  689. transformers/models/swiftformer/modeling_swiftformer.py +6 -0
  690. transformers/models/swin/modeling_swin.py +20 -12
  691. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  692. transformers/models/swin2sr/modeling_swin2sr.py +51 -33
  693. transformers/models/swinv2/modeling_swinv2.py +45 -33
  694. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  695. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  696. transformers/models/t5/configuration_t5.py +7 -1
  697. transformers/models/t5/modeling_t5.py +8 -7
  698. transformers/models/t5/tokenization_t5.py +4 -8
  699. transformers/models/t5gemma/modeling_t5gemma.py +6 -6
  700. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  701. transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
  702. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  703. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  704. transformers/models/table_transformer/modeling_table_transformer.py +5 -1
  705. transformers/models/tapas/modeling_tapas.py +3 -0
  706. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  707. transformers/models/textnet/modeling_textnet.py +11 -2
  708. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  709. transformers/models/timesfm/modeling_timesfm.py +14 -0
  710. transformers/models/timesfm/modular_timesfm.py +14 -0
  711. transformers/models/timesformer/modeling_timesformer.py +2 -0
  712. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  713. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  714. transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
  715. transformers/models/trocr/modeling_trocr.py +3 -2
  716. transformers/models/tvp/configuration_tvp.py +5 -1
  717. transformers/models/tvp/modeling_tvp.py +6 -4
  718. transformers/models/udop/configuration_udop.py +1 -0
  719. transformers/models/udop/modeling_udop.py +7 -7
  720. transformers/models/udop/tokenization_udop.py +5 -13
  721. transformers/models/umt5/configuration_umt5.py +2 -2
  722. transformers/models/umt5/modeling_umt5.py +7 -6
  723. transformers/models/unispeech/modeling_unispeech.py +4 -0
  724. transformers/models/unispeech/modular_unispeech.py +2 -0
  725. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  726. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  727. transformers/models/univnet/modeling_univnet.py +1 -0
  728. transformers/models/upernet/modeling_upernet.py +1 -0
  729. transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
  730. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  731. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  732. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  733. transformers/models/video_llava/modeling_video_llava.py +7 -3
  734. transformers/models/vilt/configuration_vilt.py +2 -2
  735. transformers/models/vilt/modeling_vilt.py +13 -0
  736. transformers/models/vipllava/modeling_vipllava.py +7 -3
  737. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  738. transformers/models/visual_bert/modeling_visual_bert.py +8 -0
  739. transformers/models/vitdet/modeling_vitdet.py +2 -0
  740. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  741. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  742. transformers/models/vitmatte/modeling_vitmatte.py +5 -0
  743. transformers/models/vitpose/configuration_vitpose.py +1 -1
  744. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  745. transformers/models/vits/modeling_vits.py +1 -0
  746. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  747. transformers/models/voxtral/modeling_voxtral.py +2 -2
  748. transformers/models/voxtral/modular_voxtral.py +2 -2
  749. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  750. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
  751. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
  752. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
  753. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  754. transformers/models/wavlm/modeling_wavlm.py +5 -0
  755. transformers/models/whisper/generation_whisper.py +1 -0
  756. transformers/models/whisper/modeling_whisper.py +11 -3
  757. transformers/models/whisper/tokenization_whisper.py +4 -15
  758. transformers/models/x_clip/modeling_x_clip.py +5 -0
  759. transformers/models/xcodec/modeling_xcodec.py +5 -0
  760. transformers/models/xglm/modeling_xglm.py +11 -0
  761. transformers/models/xglm/tokenization_xglm.py +4 -9
  762. transformers/models/xlm/modeling_xlm.py +18 -14
  763. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  764. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  765. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  766. transformers/models/xlnet/modeling_xlnet.py +3 -1
  767. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  768. transformers/models/xmod/modeling_xmod.py +3 -0
  769. transformers/models/yoso/modeling_yoso.py +10 -1
  770. transformers/models/zamba/modeling_zamba.py +4 -1
  771. transformers/models/zamba2/modeling_zamba2.py +7 -4
  772. transformers/models/zamba2/modular_zamba2.py +1 -1
  773. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  774. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  775. transformers/models/zoedepth/modeling_zoedepth.py +8 -0
  776. transformers/pipelines/__init__.py +11 -9
  777. transformers/pipelines/automatic_speech_recognition.py +20 -12
  778. transformers/pipelines/base.py +2 -10
  779. transformers/pipelines/document_question_answering.py +4 -2
  780. transformers/pipelines/question_answering.py +1 -1
  781. transformers/pipelines/text_generation.py +1 -1
  782. transformers/pipelines/text_to_audio.py +2 -2
  783. transformers/processing_utils.py +133 -50
  784. transformers/quantizers/auto.py +2 -4
  785. transformers/quantizers/base.py +44 -174
  786. transformers/quantizers/quantizer_aqlm.py +2 -23
  787. transformers/quantizers/quantizer_auto_round.py +2 -12
  788. transformers/quantizers/quantizer_awq.py +20 -89
  789. transformers/quantizers/quantizer_bitnet.py +4 -14
  790. transformers/quantizers/quantizer_bnb_4bit.py +18 -155
  791. transformers/quantizers/quantizer_bnb_8bit.py +24 -110
  792. transformers/quantizers/quantizer_compressed_tensors.py +2 -9
  793. transformers/quantizers/quantizer_eetq.py +16 -74
  794. transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
  795. transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
  796. transformers/quantizers/quantizer_fp_quant.py +52 -82
  797. transformers/quantizers/quantizer_gptq.py +8 -28
  798. transformers/quantizers/quantizer_higgs.py +42 -60
  799. transformers/quantizers/quantizer_hqq.py +144 -153
  800. transformers/quantizers/quantizer_mxfp4.py +14 -194
  801. transformers/quantizers/quantizer_quanto.py +35 -79
  802. transformers/quantizers/quantizer_quark.py +36 -17
  803. transformers/quantizers/quantizer_spqr.py +4 -12
  804. transformers/quantizers/quantizer_torchao.py +50 -325
  805. transformers/quantizers/quantizer_vptq.py +4 -27
  806. transformers/quantizers/quantizers_utils.py +20 -0
  807. transformers/testing_utils.py +324 -47
  808. transformers/tokenization_mistral_common.py +7 -2
  809. transformers/tokenization_utils_base.py +116 -224
  810. transformers/tokenization_utils_tokenizers.py +190 -106
  811. transformers/trainer.py +51 -32
  812. transformers/trainer_callback.py +8 -0
  813. transformers/trainer_jit_checkpoint.py +126 -0
  814. transformers/trainer_seq2seq.py +4 -0
  815. transformers/trainer_utils.py +1 -1
  816. transformers/training_args.py +74 -38
  817. transformers/utils/__init__.py +7 -4
  818. transformers/utils/attention_visualizer.py +4 -4
  819. transformers/utils/auto_docstring.py +35 -25
  820. transformers/utils/generic.py +47 -1
  821. transformers/utils/hub.py +5 -15
  822. transformers/utils/import_utils.py +112 -25
  823. transformers/utils/kernel_config.py +74 -19
  824. transformers/utils/loading_report.py +19 -10
  825. transformers/utils/quantization_config.py +78 -245
  826. transformers/video_processing_utils.py +17 -14
  827. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
  828. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
  829. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
  830. transformers/kernels/__init__.py +0 -0
  831. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  832. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  833. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  834. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
  835. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/deepseek_v2/modular_deepseek_v2.py

@@ -24,7 +24,8 @@ from ... import initialization as init
 from ...cache_utils import Cache
 from ...modeling_rope_utils import RopeParameters, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
-from ...utils import logging
+from ...utils import is_grouped_mm_available, logging
+from ...utils.generic import maybe_autocast
 from ..llama.configuration_llama import LlamaConfig
 from ..llama.modeling_llama import (
     LlamaDecoderLayer,
@@ -303,7 +304,7 @@ class DeepseekV2RotaryEmbedding(LlamaRotaryEmbedding):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.to(x.device) @ position_ids_expanded).transpose(1, 2)
             freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # Convert to complex representation
             freqs_cis = freqs_cis * self.attention_scaling
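
Note: maybe_autocast is imported from transformers/utils/generic.py (changed +47 -1 in the list above). Judging by this usage, it behaves like torch.autocast but degrades to a no-op on device types where autocast cannot be constructed. A minimal sketch of such a helper, assuming that behavior rather than quoting the actual implementation:

# Hypothetical sketch of a maybe_autocast-style helper, not the exact
# transformers code: torch.autocast raises at construction for device
# types it does not support, so fall back to a do-nothing context manager.
from contextlib import nullcontext

import torch


def maybe_autocast(device_type: str, enabled: bool = True, **kwargs):
    try:
        return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
    except RuntimeError:
        return nullcontext()

This keeps the "force float32" RoPE path working on backends where the old unconditional torch.autocast call would have raised.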
@@ -368,7 +369,6 @@ class DeepseekV2Attention(nn.Module):
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
         position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
-        position_ids: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         batch_size, seq_length = hidden_states.shape[:-1]
@@ -437,7 +437,9 @@ class DeepseekV2DecoderLayer(LlamaDecoderLayer):
 
 
 class DeepseekV2PreTrainedModel(LlamaPreTrainedModel):
-    _can_compile_fullgraph = False
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
 
     @torch.no_grad()
     def _init_weights(self, module):
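
Note: fullgraph torch.compile support is now advertised only when the grouped-GEMM experts path is usable (see the linked experts_interface docs). A plausible sketch of what is_grouped_mm_available gates on, assuming it checks for PyTorch's private grouped-matmul op; the real check may also inspect versions and hardware:

import torch


# Hypothetical sketch, not the exact transformers check: batched expert
# layers dispatch to torch._grouped_mm on recent PyTorch builds.
def is_grouped_mm_available() -> bool:
    return hasattr(torch, "_grouped_mm")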
transformers/models/deepseek_v3/modeling_deepseek_v3.py

@@ -16,7 +16,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_experts_implementation, use_kernel_forward_from_hub, use_kernel_func_from_hub
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import (
@@ -28,8 +28,8 @@ from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_grouped_mm_available
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_deepseek_v3 import DeepseekV3Config
 
 
@@ -71,7 +71,7 @@ class DeepseekV3RotaryEmbedding(nn.Module):
71
71
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
72
72
 
73
73
  self.register_buffer("inv_freq", inv_freq, persistent=False)
74
- self.original_inv_freq = inv_freq
74
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
75
75
 
76
76
  @staticmethod
77
77
  def compute_default_rope_parameters(
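Registering `original_inv_freq` as a non-persistent buffer (instead of a plain attribute) means the saved copy follows the module across `.to()`/`.cuda()` moves while staying out of the `state_dict`, and the `.clone()` protects it from in-place edits. A self-contained illustration of these PyTorch semantics:

```python
import torch
from torch import nn


class Demo(nn.Module):
    def __init__(self):
        super().__init__()
        inv_freq = torch.arange(4.0)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Buffer, not attribute: the copy moves with the module between devices,
        # and cloning decouples it from later in-place updates to `inv_freq`.
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)


m = Demo()
assert "original_inv_freq" not in m.state_dict()  # persistent=False: not serialized
```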
@@ -110,7 +110,7 @@ class DeepseekV3RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling

@@ -150,6 +150,7 @@ class DeepseekV3TopkRouter(nn.Module):
         return router_logits


+@use_experts_implementation
 class DeepseekV3NaiveMoe(nn.Module):
     """Collection of expert weights stored as 3D tensors."""

@@ -157,7 +158,7 @@ class DeepseekV3NaiveMoe(nn.Module):
         super().__init__()
         self.num_experts = config.num_local_experts
         self.hidden_dim = config.hidden_size
-        self.intermediate_dim = config.intermediate_size
+        self.intermediate_dim = config.moe_intermediate_size
         self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, 2 * self.intermediate_dim, self.hidden_dim))
         self.down_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, self.intermediate_dim))
         self.act_fn = ACT2FN[config.hidden_act]
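The switch to `config.moe_intermediate_size` matters because DeepSeek MoE models use a much narrower per-expert FFN than the dense `intermediate_size`, and the 3D expert tensors take their middle dimension from it. Illustrative shapes (toy numbers, not the real config values):

```python
import torch

num_experts, hidden_size = 8, 64
moe_intermediate_size = 16  # per-expert FFN width; smaller than the dense width

gate_up_proj = torch.empty(num_experts, 2 * moe_intermediate_size, hidden_size)
down_proj = torch.empty(num_experts, hidden_size, moe_intermediate_size)

print(gate_up_proj.shape)  # torch.Size([8, 32, 64])
print(down_proj.shape)     # torch.Size([8, 64, 16])
```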
@@ -542,18 +543,22 @@ class DeepseekV3PreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
-    _can_compile_fullgraph = False
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True
     _can_record_outputs = {
         "hidden_states": DeepseekV3DecoderLayer,
         "attentions": DeepseekV3Attention,
     }
+    _keep_in_fp32_modules_strict = ["e_score_correction_bias"]

     @torch.no_grad()
     def _init_weights(self, module):
         super()._init_weights(module)
         if isinstance(module, DeepseekV3TopkRouter):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            init.zeros_(module.e_score_correction_bias)
         elif isinstance(module, DeepseekV3NaiveMoe):
             init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
             init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
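`_keep_in_fp32_modules_strict` should pin the router's `e_score_correction_bias` to float32 even under half-precision loading, and `_init_weights` now zero-initializes it. A hedged check of the intent (checkpoint id and module path are illustrative, not taken from this diff):

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-V3",  # illustrative; any DeepseekV3 checkpoint
    dtype=torch.bfloat16,
)
# Module path is illustrative; the bias lives on the top-k router.
bias = model.model.layers[3].mlp.gate.e_score_correction_bias
print(bias.dtype)  # expected: torch.float32 despite the bfloat16 load
```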
@@ -12,7 +12,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GenericForSequenceClassification, GenericForTokenClassification
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import logging
+from ...utils import is_grouped_mm_available, logging
 from ..llama.modeling_llama import (
     LlamaDecoderLayer,
     LlamaForCausalLM,

@@ -107,6 +107,7 @@ class DeepseekV3NaiveMoe(MixtralExperts):
     def __init__(self, config):
         super().__init__(config)
         self.num_experts = config.num_local_experts
+        self.intermediate_dim = config.moe_intermediate_size


 class DeepseekV3MoE(nn.Module):
@@ -303,13 +304,17 @@ class DeepseekV3DecoderLayer(LlamaDecoderLayer):


 class DeepseekV3PreTrainedModel(LlamaPreTrainedModel):
-    _can_compile_fullgraph = False
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
+    _keep_in_fp32_modules_strict = ["e_score_correction_bias"]

     @torch.no_grad()
     def _init_weights(self, module):
         PreTrainedModel._init_weights(self, module)
         if isinstance(module, DeepseekV3TopkRouter):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            init.zeros_(module.e_score_correction_bias)
         elif isinstance(module, DeepseekV3NaiveMoe):
             init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
             init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
@@ -171,7 +171,6 @@ class DeepseekVLImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images

         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

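The explicit `torch.stack` calls disappear from the fast image processors throughout this release; the stacking responsibility appears to move into `BatchFeature` itself via `tensor_type` (consistent with the `feature_extraction_utils.py` changes listed above, though that is an inference, not something this diff states). A hedged sketch of the expected behavior:

```python
import torch
from transformers import BatchFeature

# Assumption: with tensor_type set, BatchFeature converts a list of
# equal-shaped tensors into one batched tensor; verify against your version.
images = [torch.zeros(3, 224, 224) for _ in range(2)]
feat = BatchFeature(data={"pixel_values": images}, tensor_type="pt")
print(feat["pixel_values"].shape)  # expected: torch.Size([2, 3, 224, 224])
```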
@@ -196,7 +196,7 @@ class DeepseekVLModel(DeepseekVLPreTrainedModel):
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs,
-    ):
+    ) -> DeepseekVLBaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"

@@ -268,7 +268,7 @@ class DeepseekVLForConditionalGeneration(DeepseekVLPreTrainedModel, GenerationMixin):
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> DeepseekVLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,

@@ -315,6 +315,7 @@ class DeepseekVLForConditionalGeneration(DeepseekVLPreTrainedModel, GenerationMixin):
         inputs_embeds=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- extra custom processing
@@ -326,12 +327,15 @@ class DeepseekVLForConditionalGeneration(DeepseekVLPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-        # Otherwise we need pixel values to be passed to model
-        if cache_position[0] == 0:
+        # Pixel values are used only in the first iteration, if available.
+        # In subsequent iterations they are already merged with the text and cached.
+        # NOTE: the first iteration doesn't have to be prefill; it can be the first
+        # iteration with a question and a cached system prompt (continuing generation from the cache)
+        if is_first_iteration or not kwargs.get("use_cache", True):
             model_inputs["pixel_values"] = pixel_values

         return model_inputs
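The gating that replaces the `cache_position[0] == 0` check is easy to state in isolation: vision inputs are forwarded on the first `generate()` iteration, or on every iteration when the KV cache is disabled. A minimal sketch of the condition:

```python
def should_forward_pixel_values(is_first_iteration: bool, use_cache: bool = True) -> bool:
    # Mirrors the new condition: first iteration, or caching disabled.
    return is_first_iteration or not use_cache


assert should_forward_pixel_values(True)          # prefill / first step from a cache
assert should_forward_pixel_values(False, False)  # no KV cache: recompute each step
assert not should_forward_pixel_values(False)     # cached decoding: vision already merged
```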
@@ -134,6 +134,9 @@ class DeepseekVLAligner(nn.Module):
 class DeepseekVLPreTrainedModel(JanusPreTrainedModel):
     _no_split_modules = ["LlamaDecoderLayer"]

+    def _init_weights(self, module):
+        raise AttributeError("No need to inherit!")
+

 @auto_docstring
 class DeepseekVLModel(JanusModel):

@@ -207,9 +207,6 @@ class DeepseekVLHybridImageProcessorFast(BaseImageProcessorFast):
             )
             high_res_processed_images_grouped[shape] = stacked_high_res_images
         high_res_processed_images = reorder_images(high_res_processed_images_grouped, grouped_high_res_images_index)
-        high_res_processed_images = (
-            torch.stack(high_res_processed_images, dim=0) if return_tensors else high_res_processed_images
-        )

         resized_images_grouped = {}
         for shape, stacked_high_res_padded_images in high_res_padded_images.items():

@@ -233,7 +230,6 @@ class DeepseekVLHybridImageProcessorFast(BaseImageProcessorFast):
             )
             processed_images_grouped[shape] = stacked_images
         processed_images = reorder_images(processed_images_grouped, grouped_resized_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(
             data={"pixel_values": processed_images, "high_res_pixel_values": high_res_processed_images},

@@ -314,7 +314,7 @@ class DeepseekVLHybridModel(DeepseekVLHybridPreTrainedModel):
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs,
-    ):
+    ) -> DeepseekVLHybridBaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
@@ -424,7 +424,7 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLHybridPreTrainedModel, GenerationMixin):
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> DeepseekVLHybridCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,

@@ -473,6 +473,7 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLHybridPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         model_inputs = super().prepare_inputs_for_generation(
@@ -482,12 +483,15 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLHybridPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration, if available.
+            # In subsequent iterations they are already merged with the text and cached.
+            # NOTE: the first iteration doesn't have to be prefill; it can be the first
+            # iteration with a question and a cached system prompt (continuing generation from the cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["high_res_pixel_values"] = high_res_pixel_values

@@ -297,7 +297,7 @@ class DeepseekVLHybridModel(DeepseekVLModel):
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs,
-    ):
+    ) -> DeepseekVLHybridBaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"

@@ -361,7 +361,7 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneration):
         use_cache: Optional[bool] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> DeepseekVLHybridCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,

@@ -410,6 +410,7 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneration):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         model_inputs = super().prepare_inputs_for_generation(
@@ -419,12 +420,15 @@ class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneration):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration, if available.
+            # In subsequent iterations they are already merged with the text and cached.
+            # NOTE: the first iteration doesn't have to be prefill; it can be the first
+            # iteration with a question and a cached system prompt (continuing generation from the cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["high_res_pixel_values"] = high_res_pixel_values

@@ -888,9 +892,6 @@ class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast):
             )
             high_res_processed_images_grouped[shape] = stacked_high_res_images
         high_res_processed_images = reorder_images(high_res_processed_images_grouped, grouped_high_res_images_index)
-        high_res_processed_images = (
-            torch.stack(high_res_processed_images, dim=0) if return_tensors else high_res_processed_images
-        )

         resized_images_grouped = {}
         for shape, stacked_high_res_padded_images in high_res_padded_images.items():

@@ -914,7 +915,6 @@ class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast):
             )
             processed_images_grouped[shape] = stacked_images
         processed_images = reorder_images(processed_images_grouped, grouped_resized_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(
             data={"pixel_values": processed_images, "high_res_pixel_values": high_res_processed_images},

@@ -37,7 +37,7 @@ class DeformableDetrConfig(PreTrainedConfig):
         use_timm_backbone (`bool`, *optional*, defaults to `True`):
             Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`]
             API.
-        backbone_config (`PreTrainedConfig` or `dict`, *optional*):
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `ResNetConfig()`):
             The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which
             case it will default to `ResNetConfig()`.
         num_channels (`int`, *optional*, defaults to 3):
@@ -269,8 +269,8 @@ class DeformableDetrConfig(PreTrainedConfig):
         self.eos_coefficient = eos_coefficient
         self.focal_alpha = focal_alpha
         self.disable_custom_kernels = disable_custom_kernels
+
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
-        self.tie_encoder_decoder = True


 __all__ = ["DeformableDetrConfig"]

@@ -956,7 +956,7 @@ class DeformableDetrPreTrainedModel(PreTrainedModel):
             init.constant_(module.value_proj.bias, 0.0)
             init.xavier_uniform_(module.output_proj.weight)
             init.constant_(module.output_proj.bias, 0.0)
-        elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+        elif isinstance(module, (nn.Linear, nn.Conv2d)):
             init.normal_(module.weight, mean=0.0, std=std)
             if module.bias is not None:
                 init.zeros_(module.bias)
@@ -1036,6 +1036,7 @@ class DeformableDetrEncoder(DeformableDetrPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         Args:

@@ -1151,6 +1152,7 @@ class DeformableDetrDecoder(DeformableDetrPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         Args:

@@ -1468,6 +1470,7 @@ class DeformableDetrModel(DeformableDetrPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], DeformableDetrModelOutput]:
         r"""
         decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):

@@ -1745,6 +1748,7 @@ class DeformableDetrForObjectDetection(DeformableDetrPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], DeformableDetrObjectDetectionOutput]:
         r"""
         decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
@@ -34,9 +34,8 @@ class DepthAnythingConfig(PreTrainedConfig):
     documentation from [`PreTrainedConfig`] for more information.

     Args:
-        backbone_config (`Union[dict[str, Any], PreTrainedConfig]`, *optional*):
-            The configuration of the backbone model. Only used in case `is_hybrid` is `True` or in case you want to
-            leverage the [`AutoBackbone`] API.
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `Dinov2Config()`):
+            The configuration of the backbone model.
         backbone (`str`, *optional*):
             Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
             will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
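Per the updated docstring, `backbone_config` now documents its `Dinov2Config()` default. A quick construction sketch (the `hidden_size` override is illustrative only):

```python
from transformers import DepthAnythingConfig, Dinov2Config

cfg = DepthAnythingConfig()  # backbone_config defaults to a DINOv2 configuration
custom = DepthAnythingConfig(backbone_config=Dinov2Config(hidden_size=384))
```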
@@ -337,6 +337,7 @@ class DepthAnythingForDepthEstimation(DepthAnythingPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], DepthEstimatorOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):

@@ -94,7 +94,6 @@ class DepthProImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images

         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -645,6 +645,7 @@ class DepthProModel(DepthProPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, DepthProOutput]:
         r"""
         Examples:

@@ -1027,6 +1028,7 @@ class DepthProForDepthEstimation(DepthProPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], DepthProDepthEstimatorOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
@@ -37,7 +37,7 @@ class DetrConfig(PreTrainedConfig):
         use_timm_backbone (`bool`, *optional*, defaults to `True`):
             Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`]
             API.
-        backbone_config (`PreTrainedConfig` or `dict`, *optional*):
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `ResNetConfig()`):
             The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which
             case it will default to `ResNetConfig()`.
         num_channels (`int`, *optional*, defaults to 3):

@@ -741,7 +741,7 @@ class DetrPreTrainedModel(PreTrainedModel):
         elif isinstance(module, DetrLearnedPositionEmbedding):
             init.uniform_(module.row_embeddings.weight)
             init.uniform_(module.column_embeddings.weight)
-        if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
             init.normal_(module.weight, mean=0.0, std=std)
             if module.bias is not None:
                 init.zeros_(module.bias)
@@ -750,6 +750,9 @@ class DetrPreTrainedModel(PreTrainedModel):
         # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it loses the flag
         if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
             init.zeros_(module.weight[module.padding_idx])
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            init.ones_(module.weight)
+            init.zeros_(module.bias)


 class DetrEncoder(DetrPreTrainedModel):
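The two DETR init hunks are complementary: batch-norm layers drop out of the `normal_(std=...)` branch (a random scale is the wrong initialization for a normalization layer), and `LayerNorm`/`GroupNorm` now get the conventional scale-one/shift-zero init. In isolation:

```python
import torch.nn as nn
from torch.nn import init

norm = nn.LayerNorm(64)
init.ones_(norm.weight)  # identity scaling
init.zeros_(norm.bias)   # no shift
```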
@@ -788,6 +791,7 @@ class DetrEncoder(DetrPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         Args:

@@ -905,6 +909,7 @@ class DetrDecoder(DetrPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         Args:

@@ -1078,6 +1083,7 @@ class DetrModel(DetrPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], DetrModelOutput]:
         r"""
         decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):

@@ -1258,6 +1264,7 @@ class DetrForObjectDetection(DetrPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], DetrObjectDetectionOutput]:
         r"""
         decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):

@@ -1404,6 +1411,7 @@ class DetrForSegmentation(DetrPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], DetrSegmentationOutput]:
         r"""
         decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
@@ -1452,8 +1460,12 @@ class DetrForSegmentation(DetrPreTrainedModel):

         >>> # A tensor of shape (height, width) where each value denotes a segment id, filled with -1 if no segment is found
         >>> panoptic_seg = result[0]["segmentation"]
+        >>> panoptic_seg.shape
+        torch.Size([300, 500])
         >>> # Get prediction score and segment_id to class_id mapping of each segment
         >>> panoptic_segments_info = result[0]["segments_info"]
+        >>> len(panoptic_segments_info)
+        5
         ```"""

         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -110,11 +110,9 @@ class DiaGenerationMixin(GenerationMixin):
         return merged_processors

     def _prepare_generation_config(
-        self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs: Any
+        self, generation_config: Optional[GenerationConfig], **kwargs: Any
     ) -> tuple[GenerationConfig, dict]:
-        generation_config, model_kwargs = super()._prepare_generation_config(
-            generation_config, use_model_defaults, **kwargs
-        )
+        generation_config, model_kwargs = super()._prepare_generation_config(generation_config, **kwargs)

         # We allow generation up to max length + max delay pattern
         # (will revert back to max length after generation)

@@ -260,7 +258,6 @@ class DiaGenerationMixin(GenerationMixin):
         streamer: Optional["BaseStreamer"] = None,
         negative_prompt_ids: Optional[torch.Tensor] = None,
         negative_prompt_attention_mask: Optional[torch.Tensor] = None,
-        use_model_defaults: Optional[bool] = None,
         custom_generate: Optional[str] = None,
         **kwargs,
     ):

@@ -273,9 +270,7 @@ class DiaGenerationMixin(GenerationMixin):
             assistant_model,
             streamer,
         )
-        generation_config, model_kwargs = self._prepare_generation_config(
-            generation_config, use_model_defaults, **kwargs
-        )
+        generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
         generation_mode = generation_config.get_generation_mode(assistant_model)

         if generation_mode not in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):

@@ -425,7 +420,6 @@ class DiaGenerationMixin(GenerationMixin):
         streamer: Optional["BaseStreamer"] = None,
         negative_prompt_ids: Optional[torch.Tensor] = None,
         negative_prompt_attention_mask: Optional[torch.Tensor] = None,
-        use_model_defaults: Optional[bool] = None,
         custom_generate: Optional[str] = None,
         **kwargs,
     ) -> Union[GenerateOutput, torch.LongTensor]:
@@ -445,7 +439,6 @@ class DiaGenerationMixin(GenerationMixin):
             streamer=streamer,
             negative_prompt_ids=negative_prompt_ids,
             negative_prompt_attention_mask=negative_prompt_attention_mask,
-            use_model_defaults=use_model_defaults,
             custom_generate=custom_generate,
             **kwargs,
         )
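With `use_model_defaults` removed from Dia's `generate()` path (matching the slimmed-down rc2 `GenerationMixin` signature), generation defaults are governed by the `GenerationConfig` alone. A hedged usage sketch (the commented call is illustrative):

```python
from transformers import GenerationConfig

gen_cfg = GenerationConfig(max_new_tokens=256, do_sample=True, top_k=50)
# model.generate(..., generation_config=gen_cfg)  # Dia model call, illustrative
```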