transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (539)
  1. transformers/__init__.py +30 -3
  2. transformers/cli/serve.py +47 -17
  3. transformers/conversion_mapping.py +15 -2
  4. transformers/convert_slow_tokenizer.py +225 -10
  5. transformers/core_model_loading.py +196 -135
  6. transformers/data/data_collator.py +12 -4
  7. transformers/dependency_versions_table.py +1 -2
  8. transformers/dynamic_module_utils.py +1 -2
  9. transformers/feature_extraction_utils.py +1 -2
  10. transformers/file_utils.py +0 -1
  11. transformers/generation/__init__.py +11 -1
  12. transformers/generation/configuration_utils.py +3 -2
  13. transformers/generation/continuous_batching/__init__.py +4 -0
  14. transformers/generation/continuous_batching/continuous_api.py +134 -79
  15. transformers/image_processing_base.py +1 -2
  16. transformers/integrations/__init__.py +4 -2
  17. transformers/integrations/accelerate.py +15 -3
  18. transformers/integrations/aqlm.py +38 -66
  19. transformers/integrations/awq.py +48 -514
  20. transformers/integrations/bitnet.py +45 -100
  21. transformers/integrations/bitsandbytes.py +79 -191
  22. transformers/integrations/deepspeed.py +1 -0
  23. transformers/integrations/eetq.py +84 -79
  24. transformers/integrations/fbgemm_fp8.py +191 -145
  25. transformers/integrations/finegrained_fp8.py +236 -193
  26. transformers/integrations/fp_quant.py +92 -0
  27. transformers/integrations/ggml.py +11 -1
  28. transformers/integrations/higgs.py +40 -62
  29. transformers/integrations/hub_kernels.py +42 -3
  30. transformers/integrations/integration_utils.py +10 -0
  31. transformers/integrations/mxfp4.py +25 -65
  32. transformers/integrations/peft.py +7 -29
  33. transformers/integrations/quanto.py +73 -55
  34. transformers/integrations/quark.py +55 -0
  35. transformers/integrations/spqr.py +44 -90
  36. transformers/integrations/torchao.py +32 -38
  37. transformers/integrations/vptq.py +42 -59
  38. transformers/modelcard.py +1 -2
  39. transformers/modeling_gguf_pytorch_utils.py +8 -0
  40. transformers/modeling_rope_utils.py +30 -6
  41. transformers/modeling_utils.py +116 -112
  42. transformers/models/__init__.py +3 -0
  43. transformers/models/afmoe/modeling_afmoe.py +4 -4
  44. transformers/models/albert/tokenization_albert.py +6 -12
  45. transformers/models/align/modeling_align.py +2 -0
  46. transformers/models/altclip/modeling_altclip.py +4 -0
  47. transformers/models/apertus/modeling_apertus.py +4 -4
  48. transformers/models/arcee/modeling_arcee.py +4 -4
  49. transformers/models/aria/modeling_aria.py +4 -4
  50. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  51. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  52. transformers/models/auto/configuration_auto.py +11 -0
  53. transformers/models/auto/feature_extraction_auto.py +2 -0
  54. transformers/models/auto/image_processing_auto.py +1 -0
  55. transformers/models/auto/modeling_auto.py +6 -0
  56. transformers/models/auto/processing_auto.py +18 -10
  57. transformers/models/auto/tokenization_auto.py +74 -472
  58. transformers/models/autoformer/modeling_autoformer.py +4 -0
  59. transformers/models/bamba/modeling_bamba.py +4 -3
  60. transformers/models/bark/modeling_bark.py +2 -0
  61. transformers/models/bart/modeling_bart.py +7 -0
  62. transformers/models/barthez/tokenization_barthez.py +5 -10
  63. transformers/models/beit/modeling_beit.py +6 -1
  64. transformers/models/bert/tokenization_bert.py +8 -21
  65. transformers/models/big_bird/modeling_big_bird.py +6 -0
  66. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  67. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
  68. transformers/models/biogpt/modeling_biogpt.py +2 -0
  69. transformers/models/biogpt/modular_biogpt.py +2 -0
  70. transformers/models/bit/modeling_bit.py +11 -2
  71. transformers/models/bitnet/modeling_bitnet.py +4 -4
  72. transformers/models/blenderbot/modeling_blenderbot.py +5 -0
  73. transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
  74. transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
  75. transformers/models/blip/modeling_blip_text.py +2 -0
  76. transformers/models/blip_2/modeling_blip_2.py +2 -1
  77. transformers/models/bloom/modeling_bloom.py +4 -0
  78. transformers/models/blt/modeling_blt.py +2 -2
  79. transformers/models/blt/modular_blt.py +2 -2
  80. transformers/models/bridgetower/modeling_bridgetower.py +5 -1
  81. transformers/models/bros/modeling_bros.py +4 -0
  82. transformers/models/camembert/tokenization_camembert.py +8 -12
  83. transformers/models/canine/modeling_canine.py +5 -0
  84. transformers/models/chameleon/modeling_chameleon.py +2 -1
  85. transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
  86. transformers/models/clap/modeling_clap.py +5 -0
  87. transformers/models/clip/tokenization_clip.py +22 -44
  88. transformers/models/clipseg/modeling_clipseg.py +5 -0
  89. transformers/models/clvp/modeling_clvp.py +5 -0
  90. transformers/models/clvp/tokenization_clvp.py +1 -63
  91. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  92. transformers/models/codegen/tokenization_codegen.py +14 -43
  93. transformers/models/cohere/modeling_cohere.py +4 -3
  94. transformers/models/cohere/modular_cohere.py +2 -1
  95. transformers/models/cohere/tokenization_cohere.py +12 -42
  96. transformers/models/cohere2/modeling_cohere2.py +7 -6
  97. transformers/models/cohere2/modular_cohere2.py +5 -5
  98. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
  99. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  100. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  101. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  102. transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
  103. transformers/models/convbert/modeling_convbert.py +6 -0
  104. transformers/models/convnext/modeling_convnext.py +2 -4
  105. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  106. transformers/models/csm/modeling_csm.py +4 -3
  107. transformers/models/ctrl/modeling_ctrl.py +1 -0
  108. transformers/models/cvt/modeling_cvt.py +2 -0
  109. transformers/models/cwm/modeling_cwm.py +4 -4
  110. transformers/models/d_fine/modeling_d_fine.py +2 -0
  111. transformers/models/d_fine/modular_d_fine.py +1 -0
  112. transformers/models/dab_detr/modeling_dab_detr.py +4 -0
  113. transformers/models/dac/modeling_dac.py +2 -2
  114. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  115. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  116. transformers/models/dbrx/modeling_dbrx.py +2 -2
  117. transformers/models/deberta/modeling_deberta.py +5 -0
  118. transformers/models/deberta/tokenization_deberta.py +11 -20
  119. transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
  120. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  121. transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
  122. transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
  123. transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
  124. transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
  125. transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
  126. transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
  127. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  128. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  129. transformers/models/detr/modeling_detr.py +5 -0
  130. transformers/models/dia/modeling_dia.py +4 -3
  131. transformers/models/dia/modular_dia.py +0 -1
  132. transformers/models/diffllama/modeling_diffllama.py +2 -2
  133. transformers/models/dinat/modeling_dinat.py +3 -0
  134. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  135. transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
  136. transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
  137. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  138. transformers/models/doge/modeling_doge.py +2 -3
  139. transformers/models/doge/modular_doge.py +0 -1
  140. transformers/models/donut/modeling_donut_swin.py +2 -0
  141. transformers/models/dots1/modeling_dots1.py +10 -7
  142. transformers/models/dots1/modular_dots1.py +5 -3
  143. transformers/models/dpr/modeling_dpr.py +5 -0
  144. transformers/models/dpr/tokenization_dpr.py +12 -0
  145. transformers/models/edgetam/modeling_edgetam.py +1 -1
  146. transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
  147. transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
  148. transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
  149. transformers/models/efficientnet/modeling_efficientnet.py +2 -0
  150. transformers/models/emu3/modeling_emu3.py +4 -4
  151. transformers/models/eomt/image_processing_eomt.py +13 -1
  152. transformers/models/eomt/image_processing_eomt_fast.py +14 -2
  153. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  154. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  155. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
  156. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
  157. transformers/models/esm/modeling_esmfold.py +5 -4
  158. transformers/models/evolla/modeling_evolla.py +4 -4
  159. transformers/models/exaone4/modeling_exaone4.py +2 -2
  160. transformers/models/exaone4/modular_exaone4.py +0 -1
  161. transformers/models/falcon/modeling_falcon.py +6 -1
  162. transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
  163. transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
  164. transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
  165. transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
  166. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  167. transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
  168. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  169. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
  170. transformers/models/flaubert/modeling_flaubert.py +7 -0
  171. transformers/models/flava/modeling_flava.py +6 -1
  172. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
  173. transformers/models/florence2/modeling_florence2.py +2 -1
  174. transformers/models/florence2/modular_florence2.py +2 -1
  175. transformers/models/fnet/modeling_fnet.py +7 -0
  176. transformers/models/focalnet/modeling_focalnet.py +4 -0
  177. transformers/models/fsmt/modeling_fsmt.py +2 -0
  178. transformers/models/funnel/modeling_funnel.py +8 -0
  179. transformers/models/funnel/tokenization_funnel.py +17 -24
  180. transformers/models/fuyu/processing_fuyu.py +3 -3
  181. transformers/models/gemma/modeling_gemma.py +4 -4
  182. transformers/models/gemma/tokenization_gemma.py +10 -27
  183. transformers/models/gemma2/modeling_gemma2.py +4 -4
  184. transformers/models/gemma2/modular_gemma2.py +2 -1
  185. transformers/models/gemma3/modeling_gemma3.py +14 -84
  186. transformers/models/gemma3/modular_gemma3.py +12 -81
  187. transformers/models/gemma3n/modeling_gemma3n.py +18 -209
  188. transformers/models/gemma3n/modular_gemma3n.py +17 -59
  189. transformers/models/git/modeling_git.py +2 -0
  190. transformers/models/glm/modeling_glm.py +4 -4
  191. transformers/models/glm4/modeling_glm4.py +4 -4
  192. transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
  193. transformers/models/glm4v/configuration_glm4v.py +3 -1
  194. transformers/models/glm4v/modeling_glm4v.py +3 -3
  195. transformers/models/glm4v/modular_glm4v.py +6 -4
  196. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  197. transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
  198. transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
  199. transformers/models/glpn/modeling_glpn.py +2 -0
  200. transformers/models/gpt2/modeling_gpt2.py +5 -1
  201. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  202. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
  203. transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
  204. transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
  205. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  206. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  207. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
  208. transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
  209. transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
  210. transformers/models/gptj/modeling_gptj.py +3 -0
  211. transformers/models/granite/modeling_granite.py +4 -4
  212. transformers/models/granitemoe/modeling_granitemoe.py +4 -6
  213. transformers/models/granitemoe/modular_granitemoe.py +0 -2
  214. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
  215. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
  216. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
  217. transformers/models/groupvit/modeling_groupvit.py +3 -0
  218. transformers/models/helium/modeling_helium.py +4 -3
  219. transformers/models/herbert/tokenization_herbert.py +9 -25
  220. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
  221. transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
  222. transformers/models/hiera/modeling_hiera.py +4 -0
  223. transformers/models/hubert/modeling_hubert.py +3 -0
  224. transformers/models/hubert/modular_hubert.py +1 -0
  225. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
  226. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
  227. transformers/models/ibert/modeling_ibert.py +6 -0
  228. transformers/models/idefics/modeling_idefics.py +5 -21
  229. transformers/models/imagegpt/modeling_imagegpt.py +2 -1
  230. transformers/models/informer/modeling_informer.py +4 -0
  231. transformers/models/informer/modular_informer.py +1 -0
  232. transformers/models/internvl/modeling_internvl.py +2 -4
  233. transformers/models/internvl/modular_internvl.py +2 -4
  234. transformers/models/jamba/modeling_jamba.py +2 -2
  235. transformers/models/janus/modeling_janus.py +1 -0
  236. transformers/models/janus/modular_janus.py +1 -0
  237. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  238. transformers/models/kosmos2/modeling_kosmos2.py +1 -0
  239. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
  240. transformers/models/lasr/__init__.py +29 -0
  241. transformers/models/lasr/configuration_lasr.py +244 -0
  242. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  243. transformers/models/lasr/modeling_lasr.py +729 -0
  244. transformers/models/lasr/modular_lasr.py +569 -0
  245. transformers/models/lasr/processing_lasr.py +96 -0
  246. transformers/models/lasr/tokenization_lasr.py +186 -0
  247. transformers/models/layoutlm/modeling_layoutlm.py +5 -0
  248. transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
  249. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
  250. transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
  251. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  252. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  253. transformers/models/led/modeling_led.py +6 -0
  254. transformers/models/levit/modeling_levit.py +3 -0
  255. transformers/models/lfm2/modeling_lfm2.py +4 -5
  256. transformers/models/lfm2/modular_lfm2.py +0 -1
  257. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
  258. transformers/models/lightglue/modeling_lightglue.py +3 -1
  259. transformers/models/lightglue/modular_lightglue.py +1 -0
  260. transformers/models/lilt/modeling_lilt.py +4 -0
  261. transformers/models/llama/modeling_llama.py +4 -4
  262. transformers/models/llama/tokenization_llama.py +15 -43
  263. transformers/models/llama4/modeling_llama4.py +3 -2
  264. transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
  265. transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
  266. transformers/models/longformer/modeling_longformer.py +6 -0
  267. transformers/models/longt5/modeling_longt5.py +4 -0
  268. transformers/models/luke/modeling_luke.py +9 -0
  269. transformers/models/luke/tokenization_luke.py +11 -38
  270. transformers/models/lxmert/modeling_lxmert.py +2 -0
  271. transformers/models/m2m_100/modeling_m2m_100.py +4 -0
  272. transformers/models/mamba/modeling_mamba.py +14 -22
  273. transformers/models/marian/modeling_marian.py +5 -0
  274. transformers/models/markuplm/modeling_markuplm.py +4 -0
  275. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  276. transformers/models/mask2former/modeling_mask2former.py +2 -0
  277. transformers/models/maskformer/modeling_maskformer.py +2 -0
  278. transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
  279. transformers/models/mbart/modeling_mbart.py +7 -0
  280. transformers/models/mbart/tokenization_mbart.py +11 -52
  281. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  282. transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
  283. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  284. transformers/models/mimi/modeling_mimi.py +3 -1
  285. transformers/models/minimax/modeling_minimax.py +4 -4
  286. transformers/models/ministral/modeling_ministral.py +4 -4
  287. transformers/models/ministral3/configuration_ministral3.py +1 -1
  288. transformers/models/ministral3/modeling_ministral3.py +4 -3
  289. transformers/models/mistral/modeling_mistral.py +4 -3
  290. transformers/models/mixtral/modeling_mixtral.py +4 -4
  291. transformers/models/mllama/modeling_mllama.py +2 -2
  292. transformers/models/mluke/tokenization_mluke.py +6 -6
  293. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
  294. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  295. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  296. transformers/models/mobilevit/modeling_mobilevit.py +3 -0
  297. transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
  298. transformers/models/modernbert/modeling_modernbert.py +4 -1
  299. transformers/models/modernbert/modular_modernbert.py +2 -0
  300. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
  301. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
  302. transformers/models/moonshine/modeling_moonshine.py +4 -2
  303. transformers/models/moshi/modeling_moshi.py +5 -2
  304. transformers/models/mpnet/modeling_mpnet.py +5 -0
  305. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  306. transformers/models/mpt/modeling_mpt.py +2 -0
  307. transformers/models/mra/modeling_mra.py +6 -0
  308. transformers/models/mt5/modeling_mt5.py +7 -0
  309. transformers/models/musicgen/modeling_musicgen.py +2 -0
  310. transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
  311. transformers/models/mvp/modeling_mvp.py +7 -0
  312. transformers/models/nanochat/modeling_nanochat.py +4 -4
  313. transformers/models/nemotron/modeling_nemotron.py +4 -2
  314. transformers/models/nllb/tokenization_nllb.py +8 -22
  315. transformers/models/nougat/tokenization_nougat.py +11 -59
  316. transformers/models/nystromformer/modeling_nystromformer.py +6 -0
  317. transformers/models/olmo/modeling_olmo.py +4 -4
  318. transformers/models/olmo/modular_olmo.py +2 -2
  319. transformers/models/olmo2/modeling_olmo2.py +4 -5
  320. transformers/models/olmo2/modular_olmo2.py +0 -1
  321. transformers/models/olmo3/modeling_olmo3.py +4 -4
  322. transformers/models/olmoe/modeling_olmoe.py +4 -4
  323. transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
  324. transformers/models/oneformer/modeling_oneformer.py +4 -1
  325. transformers/models/openai/modeling_openai.py +3 -0
  326. transformers/models/openai/tokenization_openai.py +10 -46
  327. transformers/models/opt/modeling_opt.py +2 -0
  328. transformers/models/owlv2/modeling_owlv2.py +4 -0
  329. transformers/models/owlvit/modeling_owlvit.py +4 -0
  330. transformers/models/paddleocr_vl/__init__.py +32 -0
  331. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  332. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
  333. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  334. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
  335. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
  336. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  337. transformers/models/parakeet/configuration_parakeet.py +4 -6
  338. transformers/models/parakeet/modeling_parakeet.py +9 -6
  339. transformers/models/parakeet/modular_parakeet.py +2 -2
  340. transformers/models/parakeet/processing_parakeet.py +1 -0
  341. transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
  342. transformers/models/patchtst/modeling_patchtst.py +20 -2
  343. transformers/models/pegasus/modeling_pegasus.py +5 -0
  344. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  345. transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
  346. transformers/models/perceiver/modeling_perceiver.py +8 -0
  347. transformers/models/persimmon/modeling_persimmon.py +2 -1
  348. transformers/models/phi/modeling_phi.py +4 -5
  349. transformers/models/phi/modular_phi.py +0 -1
  350. transformers/models/phi3/modeling_phi3.py +2 -1
  351. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
  352. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
  353. transformers/models/phimoe/modeling_phimoe.py +4 -4
  354. transformers/models/phimoe/modular_phimoe.py +2 -2
  355. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  356. transformers/models/pixtral/modeling_pixtral.py +2 -1
  357. transformers/models/plbart/modeling_plbart.py +6 -0
  358. transformers/models/plbart/modular_plbart.py +2 -0
  359. transformers/models/plbart/tokenization_plbart.py +0 -2
  360. transformers/models/poolformer/modeling_poolformer.py +2 -0
  361. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  362. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  363. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  364. transformers/models/prophetnet/modeling_prophetnet.py +3 -0
  365. transformers/models/pvt/modeling_pvt.py +2 -0
  366. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  367. transformers/models/qwen2/modeling_qwen2.py +4 -4
  368. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  369. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  370. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
  371. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
  372. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  373. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
  374. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
  375. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
  376. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  377. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  378. transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
  379. transformers/models/qwen3/modeling_qwen3.py +4 -4
  380. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  381. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
  382. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
  383. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
  384. transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
  385. transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
  386. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
  387. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
  388. transformers/models/rag/modeling_rag.py +1 -0
  389. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
  390. transformers/models/reformer/modeling_reformer.py +4 -0
  391. transformers/models/reformer/tokenization_reformer.py +11 -28
  392. transformers/models/regnet/modeling_regnet.py +6 -1
  393. transformers/models/rembert/modeling_rembert.py +6 -0
  394. transformers/models/rembert/tokenization_rembert.py +3 -10
  395. transformers/models/resnet/modeling_resnet.py +11 -2
  396. transformers/models/roberta/tokenization_roberta.py +18 -27
  397. transformers/models/roformer/modeling_roformer.py +6 -0
  398. transformers/models/roformer/tokenization_roformer.py +77 -412
  399. transformers/models/rt_detr/modeling_rt_detr.py +2 -0
  400. transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
  401. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
  402. transformers/models/rwkv/modeling_rwkv.py +1 -0
  403. transformers/models/sam2/modeling_sam2.py +2 -2
  404. transformers/models/sam2/modular_sam2.py +2 -2
  405. transformers/models/sam2_video/modeling_sam2_video.py +1 -0
  406. transformers/models/sam2_video/modular_sam2_video.py +1 -0
  407. transformers/models/sam3/modeling_sam3.py +77 -80
  408. transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
  409. transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
  410. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
  411. transformers/models/sam3_video/modeling_sam3_video.py +1 -0
  412. transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
  413. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  414. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
  415. transformers/models/seed_oss/modeling_seed_oss.py +2 -2
  416. transformers/models/segformer/modeling_segformer.py +4 -1
  417. transformers/models/seggpt/modeling_seggpt.py +2 -0
  418. transformers/models/sew/modeling_sew.py +3 -0
  419. transformers/models/sew/modular_sew.py +1 -0
  420. transformers/models/sew_d/modeling_sew_d.py +3 -0
  421. transformers/models/siglip2/modeling_siglip2.py +4 -0
  422. transformers/models/siglip2/modular_siglip2.py +4 -0
  423. transformers/models/smollm3/modeling_smollm3.py +4 -4
  424. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  425. transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
  426. transformers/models/speecht5/modeling_speecht5.py +13 -1
  427. transformers/models/splinter/modeling_splinter.py +3 -0
  428. transformers/models/splinter/tokenization_splinter.py +9 -28
  429. transformers/models/squeezebert/modeling_squeezebert.py +6 -0
  430. transformers/models/stablelm/modeling_stablelm.py +3 -1
  431. transformers/models/starcoder2/modeling_starcoder2.py +4 -3
  432. transformers/models/superglue/modeling_superglue.py +1 -0
  433. transformers/models/superpoint/modeling_superpoint.py +1 -0
  434. transformers/models/swiftformer/modeling_swiftformer.py +2 -0
  435. transformers/models/swin/modeling_swin.py +4 -0
  436. transformers/models/swin2sr/modeling_swin2sr.py +2 -0
  437. transformers/models/swinv2/modeling_swinv2.py +4 -0
  438. transformers/models/t5/modeling_t5.py +7 -0
  439. transformers/models/t5/tokenization_t5.py +4 -8
  440. transformers/models/t5gemma/modeling_t5gemma.py +5 -5
  441. transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
  442. transformers/models/table_transformer/modeling_table_transformer.py +4 -0
  443. transformers/models/tapas/modeling_tapas.py +3 -0
  444. transformers/models/textnet/modeling_textnet.py +11 -2
  445. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  446. transformers/models/timesfm/modeling_timesfm.py +2 -0
  447. transformers/models/timesfm/modular_timesfm.py +2 -0
  448. transformers/models/timesformer/modeling_timesformer.py +2 -0
  449. transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
  450. transformers/models/trocr/modeling_trocr.py +2 -0
  451. transformers/models/tvp/modeling_tvp.py +2 -0
  452. transformers/models/udop/modeling_udop.py +4 -0
  453. transformers/models/udop/tokenization_udop.py +5 -13
  454. transformers/models/umt5/modeling_umt5.py +7 -0
  455. transformers/models/unispeech/modeling_unispeech.py +4 -0
  456. transformers/models/unispeech/modular_unispeech.py +2 -0
  457. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  458. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  459. transformers/models/univnet/modeling_univnet.py +1 -0
  460. transformers/models/upernet/modeling_upernet.py +1 -0
  461. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  462. transformers/models/vilt/modeling_vilt.py +6 -0
  463. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  464. transformers/models/visual_bert/modeling_visual_bert.py +6 -0
  465. transformers/models/vitdet/modeling_vitdet.py +2 -0
  466. transformers/models/vitmatte/modeling_vitmatte.py +1 -0
  467. transformers/models/vits/modeling_vits.py +1 -0
  468. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  469. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  470. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
  471. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
  472. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
  473. transformers/models/wavlm/modeling_wavlm.py +5 -0
  474. transformers/models/whisper/modeling_whisper.py +6 -0
  475. transformers/models/whisper/tokenization_whisper.py +4 -15
  476. transformers/models/x_clip/modeling_x_clip.py +3 -0
  477. transformers/models/xglm/modeling_xglm.py +1 -0
  478. transformers/models/xglm/tokenization_xglm.py +4 -9
  479. transformers/models/xlm/modeling_xlm.py +5 -0
  480. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  481. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  482. transformers/models/yoso/modeling_yoso.py +6 -0
  483. transformers/models/zamba/modeling_zamba.py +2 -0
  484. transformers/models/zamba2/modeling_zamba2.py +4 -2
  485. transformers/models/zamba2/modular_zamba2.py +1 -1
  486. transformers/models/zoedepth/modeling_zoedepth.py +1 -0
  487. transformers/pipelines/__init__.py +2 -3
  488. transformers/pipelines/base.py +1 -9
  489. transformers/pipelines/document_question_answering.py +3 -1
  490. transformers/pipelines/text_generation.py +1 -1
  491. transformers/processing_utils.py +23 -11
  492. transformers/quantizers/base.py +35 -110
  493. transformers/quantizers/quantizer_aqlm.py +1 -5
  494. transformers/quantizers/quantizer_auto_round.py +1 -2
  495. transformers/quantizers/quantizer_awq.py +17 -81
  496. transformers/quantizers/quantizer_bitnet.py +3 -8
  497. transformers/quantizers/quantizer_bnb_4bit.py +13 -110
  498. transformers/quantizers/quantizer_bnb_8bit.py +16 -92
  499. transformers/quantizers/quantizer_compressed_tensors.py +1 -5
  500. transformers/quantizers/quantizer_eetq.py +14 -62
  501. transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
  502. transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
  503. transformers/quantizers/quantizer_fp_quant.py +48 -78
  504. transformers/quantizers/quantizer_gptq.py +7 -24
  505. transformers/quantizers/quantizer_higgs.py +40 -54
  506. transformers/quantizers/quantizer_hqq.py +144 -153
  507. transformers/quantizers/quantizer_mxfp4.py +13 -167
  508. transformers/quantizers/quantizer_quanto.py +20 -64
  509. transformers/quantizers/quantizer_quark.py +36 -17
  510. transformers/quantizers/quantizer_spqr.py +1 -4
  511. transformers/quantizers/quantizer_torchao.py +23 -202
  512. transformers/quantizers/quantizer_vptq.py +8 -22
  513. transformers/quantizers/quantizers_utils.py +20 -0
  514. transformers/testing_utils.py +297 -36
  515. transformers/tokenization_mistral_common.py +4 -0
  516. transformers/tokenization_utils_base.py +113 -222
  517. transformers/tokenization_utils_tokenizers.py +168 -107
  518. transformers/trainer.py +28 -31
  519. transformers/trainer_jit_checkpoint.py +126 -0
  520. transformers/trainer_utils.py +1 -1
  521. transformers/training_args.py +66 -28
  522. transformers/utils/__init__.py +3 -4
  523. transformers/utils/auto_docstring.py +1 -0
  524. transformers/utils/generic.py +27 -1
  525. transformers/utils/hub.py +5 -15
  526. transformers/utils/import_utils.py +61 -16
  527. transformers/utils/kernel_config.py +4 -2
  528. transformers/utils/loading_report.py +19 -10
  529. transformers/utils/quantization_config.py +75 -242
  530. transformers/video_processing_utils.py +1 -2
  531. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
  532. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
  533. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
  534. transformers/kernels/__init__.py +0 -0
  535. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  536. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  537. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
  538. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
  539. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
@@ -773,6 +773,7 @@ class SEWModel(SEWPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -902,6 +903,7 @@ class SEWForCTC(SEWPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
@@ -1013,6 +1015,7 @@ class SEWForSequenceClassification(SEWPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -392,6 +392,7 @@ class SEWModel(SEWPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1318,6 +1318,7 @@ class SEWDModel(SEWDPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1445,6 +1446,7 @@ class SEWDForCTC(SEWDPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
@@ -1557,6 +1559,7 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -510,6 +510,7 @@ class Siglip2VisionTransformer(Siglip2PreTrainedModel):
         spatial_shapes: torch.LongTensor,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> BaseModelOutputWithPooling:
         r"""
         spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
@@ -760,6 +761,7 @@ class Siglip2VisionModel(Siglip2PreTrainedModel):
         spatial_shapes: torch.LongTensor,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> BaseModelOutputWithPooling:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
@@ -927,6 +929,7 @@ class Siglip2Model(Siglip2PreTrainedModel):
         return_loss: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> Siglip2Output:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
@@ -1058,6 +1061,7 @@ class Siglip2ForImageClassification(Siglip2PreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> ImageClassifierOutput:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
@@ -247,6 +247,7 @@ class Siglip2VisionTransformer(SiglipVisionTransformer):
         spatial_shapes: torch.LongTensor,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> BaseModelOutputWithPooling:
         r"""
         spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
@@ -324,6 +325,7 @@ class Siglip2VisionModel(SiglipVisionModel):
         spatial_shapes: torch.LongTensor,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> BaseModelOutputWithPooling:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
@@ -419,6 +421,7 @@ class Siglip2Model(SiglipModel):
         return_loss: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> Siglip2Output:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
@@ -522,6 +525,7 @@ class Siglip2ForImageClassification(SiglipForImageClassification):
         labels: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> ImageClassifierOutput:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
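
Note: the hunks above, and the analogous Speech2Text, SpeechT5, Splinter, SqueezeBert, and StableLM hunks below, all make the same mechanical change: `forward` signatures gain a trailing `**kwargs`. A minimal sketch of the effect, with hypothetical class and argument names, assuming the intent is to tolerate extra keyword arguments threaded through shared calling code:

from typing import Optional

class ToyModel:
    def forward(
        self,
        input_values: list,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,  # rc1-style: unknown keywords are accepted instead of raising
    ) -> dict:
        # Real models may forward these kwargs to submodules; ignored here.
        return {"n_inputs": len(input_values)}

# Before this change, an extra keyword such as this hypothetical flag would
# fail with "forward() got an unexpected keyword argument".
ToyModel().forward([0.1, 0.2], some_new_flag=True)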
@@ -28,7 +28,7 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import (
@@ -42,7 +42,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_smollm3 import SmolLM3Config


@@ -102,7 +102,7 @@ class SmolLM3RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
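
The `maybe_autocast` helper that replaces `torch.autocast` here (and in the StableLM and Starcoder2 hunks below) is imported from `...utils.generic`, but its body is not part of this diff. A plausible sketch, assuming it simply degrades to a no-op context where autocast is unsupported for the device; the real helper may differ:

import contextlib
import torch

def maybe_autocast(device_type: str, enabled: bool = True, **kwargs):
    # Hypothetical: try torch.autocast, fall back to a no-op context
    # instead of raising on device types without autocast support.
    try:
        return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
    except RuntimeError:
        return contextlib.nullcontext()

# Usage mirroring the rotary-embedding hunks: force float32 math.
with maybe_autocast(device_type="cpu", enabled=False):
    freqs = torch.randn(2, 4).float() @ torch.randn(4, 3).float()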
@@ -184,6 +184,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class SmolLM3Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -209,7 +210,6 @@ class SmolLM3Attention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb

         self.use_rope = config.no_rope_layers[layer_idx]
         self.sliding_window = (
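
`use_kernelized_func` is imported from `...integrations` but not defined in this diff. Pairing the new class decorator with the removed per-instance `self.rotary_fn = apply_rotary_pos_emb` assignment suggests it attaches the (possibly hub-kernelized) function at class level. A toy sketch of that effect only, not the actual implementation:

def use_kernelized_func(func):
    # Hypothetical: attach the function as a class attribute so every
    # instance shares one (potentially kernel-swapped) implementation.
    def wrap(cls):
        cls.rotary_fn = staticmethod(func)
        return cls
    return wrap

def apply_rotary_pos_emb(q, k, cos, sin):  # stand-in for the real function
    return q, k

@use_kernelized_func(apply_rotary_pos_emb)
class ToyAttention:
    pass

assert ToyAttention.rotary_fn is apply_rotary_pos_emb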
@@ -27,13 +27,6 @@ from ...utils import is_num2words_available, is_vision_available, logging
 from ...video_utils import VideoInput


-if is_vision_available():
-    from .video_processing_smolvlm import (
-        DEFAULT_MEDIA_OUTTRO,
-        DEFAULT_VIDEO_INTRO,
-        FRAME_TIMESTAMP_MESSAGE,
-    )
-
 if is_vision_available():
     from .video_processing_smolvlm import (
         DEFAULT_MEDIA_OUTTRO,
@@ -567,6 +567,7 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -707,6 +708,7 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel):
         output_hidden_states=None,
         return_dict=None,
         cache_position=None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -899,6 +901,7 @@ class Speech2TextModel(Speech2TextPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1035,6 +1038,7 @@ class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel, Generation
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1239,6 +1239,7 @@ class SpeechT5Encoder(SpeechT5PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         """
         Args:
@@ -1342,6 +1343,7 @@ class SpeechT5EncoderWithSpeechPrenet(SpeechT5PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         hidden_states, attention_mask = self.prenet(input_values, attention_mask)

@@ -1382,6 +1384,7 @@ class SpeechT5EncoderWithTextPrenet(SpeechT5PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         hidden_states = self.prenet(input_values)

@@ -1416,6 +1419,7 @@ class SpeechT5EncoderWithoutPrenet(SpeechT5PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         return self.wrapped_encoder(
             hidden_states=input_values,
@@ -1454,6 +1458,7 @@ class SpeechT5Decoder(SpeechT5PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         r"""
         Args:
@@ -1613,6 +1618,7 @@ class SpeechT5DecoderWithSpeechPrenet(SpeechT5PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         decoder_hidden_states = self.prenet(input_values, speaker_embeddings)

@@ -1663,6 +1669,7 @@ class SpeechT5DecoderWithTextPrenet(SpeechT5PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         decoder_hidden_states, attention_mask = self.prenet(input_values, attention_mask, past_key_values)

@@ -1707,6 +1714,7 @@ class SpeechT5DecoderWithoutPrenet(SpeechT5PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         outputs = self.wrapped_decoder(
             hidden_states=input_values,
@@ -1905,6 +1913,7 @@ class SpeechT5Model(SpeechT5PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
         r"""
         input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
@@ -2046,6 +2055,7 @@ class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         labels: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqLMOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -2356,6 +2366,7 @@ class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel):
         labels: Optional[torch.FloatTensor] = None,
         stop_labels: Optional[torch.Tensor] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqSpectrogramOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -2694,6 +2705,7 @@ class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
         labels: Optional[torch.FloatTensor] = None,
         stop_labels: Optional[torch.Tensor] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqSpectrogramOutput]:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -3023,7 +3035,7 @@ class SpeechT5HifiGan(PreTrainedModel):
             waveform.
         """
     )
-    def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, spectrogram: torch.FloatTensor, **kwargs) -> torch.FloatTensor:
         r"""
         spectrogram (`torch.FloatTensor`):
             Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
@@ -368,6 +368,7 @@ class SplinterModel(SplinterPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
@@ -516,6 +517,7 @@ class SplinterForQuestionAnswering(SplinterPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         question_positions: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         r"""
         token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
@@ -658,6 +660,7 @@ class SplinterForPreTraining(SplinterPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         question_positions: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, SplinterForPreTrainingOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_questions, sequence_length)`):
@@ -16,7 +16,7 @@
 """Tokenization classes for Splinter."""

 import collections
-from typing import Optional
+from typing import Optional, Union

 from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
 from tokenizers.models import WordPiece
@@ -72,16 +72,17 @@ class SplinterTokenizer(TokenizersBackend):
         strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
             value for `lowercase`.
-        vocab (`dict`, *optional*):
+        vocab (`str`, `dict` or `list`, *optional*):
             Custom vocabulary dictionary. If not provided, a minimal vocabulary is created.
     """

     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = None
+    model = WordPiece

     def __init__(
         self,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
         do_lower_case: bool = True,
         unk_token: str = "[UNK]",
         sep_token: str = "[SEP]",
@@ -91,15 +92,12 @@ class SplinterTokenizer(TokenizersBackend):
         question_token: str = "[QUESTION]",
         tokenize_chinese_chars: bool = True,
         strip_accents: Optional[bool] = None,
-        vocab: Optional[dict] = None,
         **kwargs,
     ):
-        if vocab is not None:
-            self._vocab = (
-                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
-            )
-        else:
-            self._vocab = {
+        self._vocab = (
+            vocab
+            if vocab is not None
+            else {
                 str(pad_token): 0,
                 str(unk_token): 1,
                 str(cls_token): 2,
@@ -108,6 +106,7 @@ class SplinterTokenizer(TokenizersBackend):
                 str(question_token): 5,
                 ".": 6,
             }
+        )

         self._tokenizer = Tokenizer(WordPiece(self._vocab, unk_token=str(unk_token)))

@@ -120,10 +119,7 @@ class SplinterTokenizer(TokenizersBackend):
         self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
         self._tokenizer.decoder = decoders.WordPiece(prefix="##")

-        tokenizer_object = self._tokenizer
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             unk_token=unk_token,
             sep_token=sep_token,
             pad_token=pad_token,
@@ -136,21 +132,6 @@ class SplinterTokenizer(TokenizersBackend):
             **kwargs,
         )

-        if hasattr(self, "_tokenizer") and self._tokenizer.normalizer is not None:
-            import json
-
-            pre_tok_state = json.loads(self._tokenizer.normalizer.__getstate__())
-            if (
-                pre_tok_state.get("lowercase", do_lower_case) != do_lower_case
-                or pre_tok_state.get("strip_accents", strip_accents) != strip_accents
-                or pre_tok_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
-            ):
-                pre_tok_class = getattr(normalizers, pre_tok_state.pop("type"))
-                pre_tok_state["lowercase"] = do_lower_case
-                pre_tok_state["strip_accents"] = strip_accents
-                pre_tok_state["handle_chinese_chars"] = tokenize_chinese_chars
-                self._tokenizer.normalizer = pre_tok_class(**pre_tok_state)
-
         self.do_lower_case = do_lower_case
         self.tokenize_chinese_chars = tokenize_chinese_chars
         self.strip_accents = strip_accents
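
The SplinterTokenizer hunks above collapse the old list-vs-dict branching into one conditional expression with a minimal default vocabulary. A condensed, standalone sketch of the resulting construction logic; the special-token strings beyond `[UNK]`/`[SEP]`/`[QUESTION]` are assumed BERT-style defaults, and the helper name is hypothetical:

from tokenizers import Tokenizer
from tokenizers.models import WordPiece

def build_splinter_wordpiece(vocab=None, unk_token="[UNK]"):
    # Mirror the simplified rc1 logic: use a caller-supplied vocab as-is,
    # otherwise fall back to the minimal vocabulary from the hunk above.
    vocab = (
        vocab
        if vocab is not None
        else {
            "[PAD]": 0,
            "[UNK]": 1,
            "[CLS]": 2,
            "[SEP]": 3,
            "[MASK]": 4,
            "[QUESTION]": 5,
            ".": 6,
        }
    )
    return Tokenizer(WordPiece(vocab, unk_token=unk_token))

tok = build_splinter_wordpiece()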
@@ -443,6 +443,7 @@ class SqueezeBertModel(SqueezeBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -528,6 +529,7 @@ class SqueezeBertForMaskedLM(SqueezeBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MaskedLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -599,6 +601,7 @@ class SqueezeBertForSequenceClassification(SqueezeBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -683,6 +686,7 @@ class SqueezeBertForMultipleChoice(SqueezeBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MultipleChoiceModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -786,6 +790,7 @@ class SqueezeBertForTokenClassification(SqueezeBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -851,6 +856,7 @@ class SqueezeBertForQuestionAnswering(SqueezeBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -45,6 +45,7 @@ from ...modeling_rope_utils import (
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
+from ...utils.generic import maybe_autocast
 from .configuration_stablelm import StableLmConfig
 
 
@@ -117,7 +118,7 @@ class StableLmRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
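Here, and in the Starcoder2 rotary-embedding hunk below, `torch.autocast(..., enabled=False)` is swapped for a `maybe_autocast` helper imported from `...utils.generic`. The helper's body is not part of this diff; a plausible reading, sketched here purely as an assumption, is a guard that falls back to a no-op context on device types where `torch.autocast` cannot be constructed:

```python
# Hypothetical sketch of a maybe_autocast-style helper; the actual
# transformers.utils.generic implementation is not shown in this diff.
import contextlib

import torch


def maybe_autocast(device_type: str, enabled: bool = True, **kwargs):
    try:
        return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
    except RuntimeError:
        # e.g. backends for which autocast is not implemented
        return contextlib.nullcontext()


# Usage mirrors the RoPE forward: keep the matmul in float32.
with maybe_autocast(device_type="cpu", enabled=False):
    freqs = torch.ones(2, 2).float() @ torch.ones(2, 2).float()
```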
@@ -492,6 +493,7 @@ class StableLmModel(StableLmPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> BaseModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -35,7 +35,7 @@ from transformers.utils.generic import check_model_inputs
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_func_from_hub
+from ...integrations import use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import (
@@ -48,6 +48,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.generic import maybe_autocast
 from .configuration_starcoder2 import Starcoder2Config
 
 
@@ -141,6 +142,7 @@ def eager_attention_forward(
     return attn_output, attn_weights
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class Starcoder2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -157,7 +159,6 @@ class Starcoder2Attention(nn.Module):
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
         self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
         self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.use_bias)
-        self.rotary_fn = apply_rotary_pos_emb
         self.residual_dropout = config.residual_dropout
 
     def forward(
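Taken together, the two Starcoder2Attention hunks replace the per-instance `self.rotary_fn = apply_rotary_pos_emb` assignment with a class-level `@use_kernelized_func(apply_rotary_pos_emb)` decorator. The decorator is exported from `...integrations` alongside `use_kernel_func_from_hub`; its implementation is not in this diff, but the shape of the pattern is a class decorator that attaches the eager function as the default and leaves a single seam where a kernelized replacement can be swapped in. A sketch under those assumptions:

```python
# Hypothetical sketch of a use_kernelized_func-style decorator; the real
# transformers.integrations implementation is not part of this hunk.
def use_kernelized_func(func):
    def wrap(cls):
        cls.rotary_fn = staticmethod(func)  # eager default, swappable later
        return cls
    return wrap


def apply_rotary_pos_emb(q, k, cos, sin):
    # stand-in for the eager RoPE implementation in the modeling file
    return q, k


@use_kernelized_func(apply_rotary_pos_emb)
class Starcoder2Attention:
    def rotate(self, q, k, cos, sin):
        # resolved through the class attribute, so replacing
        # Starcoder2Attention.rotary_fn re-routes every instance at once
        return self.rotary_fn(q, k, cos, sin)
```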
@@ -327,7 +328,7 @@ class Starcoder2RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -670,6 +670,7 @@ class SuperGlueForKeypointMatching(SuperGluePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SuperGlueKeypointMatchingOutput]:
         r"""
         Examples:
@@ -378,6 +378,7 @@ class SuperPointForKeypointDetection(SuperPointPreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SuperPointKeypointDescriptionOutput]:
         r"""
         Examples:
@@ -428,6 +428,7 @@ class SwiftFormerModel(SwiftFormerPreTrainedModel):
         pixel_values: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithNoAttention]:
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -478,6 +479,7 @@ class SwiftFormerForImageClassification(SwiftFormerPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -860,6 +860,7 @@ class SwinModel(SwinPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SwinModelOutput]:
         r"""
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
@@ -946,6 +947,7 @@ class SwinForMaskedImageModeling(SwinPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SwinMaskedImageModelingOutput]:
         r"""
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
@@ -1059,6 +1061,7 @@ class SwinForImageClassification(SwinPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SwinImageClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1129,6 +1132,7 @@ class SwinBackbone(SwinPreTrainedModel, BackboneMixin):
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> BackboneOutput:
         """
         Returns:
@@ -754,6 +754,7 @@ class Swin2SRModel(Swin2SRPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -972,6 +973,7 @@ class Swin2SRForImageSuperResolution(Swin2SRPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ImageSuperResolutionOutput]:
         r"""
         Example:
@@ -942,6 +942,7 @@ class Swinv2Model(Swinv2PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Swinv2ModelOutput]:
         r"""
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
@@ -1030,6 +1031,7 @@ class Swinv2ForMaskedImageModeling(Swinv2PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Swinv2MaskedImageModelingOutput]:
         r"""
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
@@ -1144,6 +1146,7 @@ class Swinv2ForImageClassification(Swinv2PreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Swinv2ImageClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1209,6 +1212,7 @@ class Swinv2Backbone(Swinv2PreTrainedModel, BackboneMixin):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> BackboneOutput:
         r"""
         Examples:
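All fourteen hunks above apply the same `+ **kwargs,` change across the vision models. A quick end-to-end check of the new tolerance, assuming unconsumed kwargs are simply absorbed; the tiny config values and the `extra_flag` kwarg are illustrative, not from the diff:

```python
import torch

from transformers import SwinConfig, SwinModel

# Deliberately small config so the randomly initialized model builds fast.
config = SwinConfig(
    image_size=32, patch_size=4, embed_dim=16, depths=[1, 1], num_heads=[2, 2], window_size=4
)
model = SwinModel(config)

# `extra_flag` is a hypothetical kwarg: the rc0 signatures would raise a
# TypeError, while the rc1 signatures above accept and ignore it.
out = model(pixel_values=torch.randn(1, 3, 32, 32), extra_flag=True)
print(out.last_hidden_state.shape)
```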