transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (835)
  1. transformers/__init__.py +49 -3
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/cli/serve.py +47 -17
  6. transformers/configuration_utils.py +114 -70
  7. transformers/conversion_mapping.py +83 -7
  8. transformers/convert_slow_tokenizer.py +225 -10
  9. transformers/core_model_loading.py +374 -147
  10. transformers/data/data_collator.py +12 -4
  11. transformers/dependency_versions_table.py +2 -3
  12. transformers/dynamic_module_utils.py +1 -2
  13. transformers/feature_extraction_utils.py +55 -24
  14. transformers/file_utils.py +0 -1
  15. transformers/generation/__init__.py +11 -1
  16. transformers/generation/candidate_generator.py +79 -31
  17. transformers/generation/configuration_utils.py +165 -124
  18. transformers/generation/continuous_batching/__init__.py +4 -0
  19. transformers/generation/continuous_batching/cache.py +47 -18
  20. transformers/generation/continuous_batching/cache_manager.py +131 -34
  21. transformers/generation/continuous_batching/continuous_api.py +228 -136
  22. transformers/generation/continuous_batching/requests.py +28 -1
  23. transformers/generation/continuous_batching/scheduler.py +11 -4
  24. transformers/generation/stopping_criteria.py +1 -1
  25. transformers/generation/utils.py +108 -110
  26. transformers/generation/watermarking.py +8 -5
  27. transformers/image_processing_base.py +3 -14
  28. transformers/image_processing_utils_fast.py +15 -4
  29. transformers/initialization.py +37 -0
  30. transformers/integrations/__init__.py +16 -2
  31. transformers/integrations/accelerate.py +58 -113
  32. transformers/integrations/aqlm.py +36 -66
  33. transformers/integrations/awq.py +46 -515
  34. transformers/integrations/bitnet.py +47 -105
  35. transformers/integrations/bitsandbytes.py +91 -202
  36. transformers/integrations/deepspeed.py +18 -2
  37. transformers/integrations/eetq.py +84 -81
  38. transformers/integrations/fbgemm_fp8.py +191 -145
  39. transformers/integrations/finegrained_fp8.py +241 -208
  40. transformers/integrations/flash_attention.py +2 -2
  41. transformers/integrations/fp_quant.py +92 -0
  42. transformers/integrations/ggml.py +11 -1
  43. transformers/integrations/higgs.py +37 -62
  44. transformers/integrations/hub_kernels.py +65 -8
  45. transformers/integrations/integration_utils.py +45 -0
  46. transformers/integrations/mistral.py +12 -0
  47. transformers/integrations/moe.py +240 -0
  48. transformers/integrations/mxfp4.py +28 -74
  49. transformers/integrations/peft.py +12 -29
  50. transformers/integrations/quanto.py +77 -56
  51. transformers/integrations/quark.py +55 -0
  52. transformers/integrations/spqr.py +42 -90
  53. transformers/integrations/tensor_parallel.py +167 -221
  54. transformers/integrations/torchao.py +32 -38
  55. transformers/integrations/vptq.py +40 -59
  56. transformers/modelcard.py +1 -2
  57. transformers/modeling_gguf_pytorch_utils.py +74 -19
  58. transformers/modeling_rope_utils.py +107 -86
  59. transformers/modeling_utils.py +611 -527
  60. transformers/models/__init__.py +22 -0
  61. transformers/models/afmoe/modeling_afmoe.py +10 -19
  62. transformers/models/afmoe/modular_afmoe.py +5 -13
  63. transformers/models/aimv2/modeling_aimv2.py +4 -0
  64. transformers/models/aimv2/modular_aimv2.py +4 -0
  65. transformers/models/albert/modeling_albert.py +3 -0
  66. transformers/models/albert/tokenization_albert.py +6 -12
  67. transformers/models/align/modeling_align.py +14 -6
  68. transformers/models/altclip/modeling_altclip.py +11 -3
  69. transformers/models/apertus/modeling_apertus.py +8 -6
  70. transformers/models/apertus/modular_apertus.py +4 -1
  71. transformers/models/arcee/modeling_arcee.py +5 -5
  72. transformers/models/aria/modeling_aria.py +12 -8
  73. transformers/models/aria/modular_aria.py +7 -3
  74. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  75. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  76. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  77. transformers/models/auto/auto_factory.py +1 -1
  78. transformers/models/auto/configuration_auto.py +38 -0
  79. transformers/models/auto/feature_extraction_auto.py +9 -3
  80. transformers/models/auto/image_processing_auto.py +5 -2
  81. transformers/models/auto/modeling_auto.py +37 -0
  82. transformers/models/auto/processing_auto.py +22 -10
  83. transformers/models/auto/tokenization_auto.py +147 -566
  84. transformers/models/auto/video_processing_auto.py +5 -2
  85. transformers/models/autoformer/modeling_autoformer.py +4 -0
  86. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  87. transformers/models/bamba/modeling_bamba.py +21 -21
  88. transformers/models/bamba/modular_bamba.py +17 -16
  89. transformers/models/bark/modeling_bark.py +11 -0
  90. transformers/models/bart/configuration_bart.py +0 -1
  91. transformers/models/bart/modeling_bart.py +14 -0
  92. transformers/models/barthez/tokenization_barthez.py +5 -10
  93. transformers/models/beit/image_processing_beit_fast.py +0 -1
  94. transformers/models/beit/modeling_beit.py +6 -1
  95. transformers/models/bert/modeling_bert.py +3 -0
  96. transformers/models/bert/tokenization_bert.py +8 -21
  97. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  98. transformers/models/big_bird/modeling_big_bird.py +9 -0
  99. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  100. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
  101. transformers/models/biogpt/modeling_biogpt.py +2 -0
  102. transformers/models/biogpt/modular_biogpt.py +2 -0
  103. transformers/models/bit/modeling_bit.py +16 -3
  104. transformers/models/bitnet/modeling_bitnet.py +5 -5
  105. transformers/models/blenderbot/modeling_blenderbot.py +12 -0
  106. transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
  107. transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
  108. transformers/models/blip/modeling_blip.py +2 -0
  109. transformers/models/blip/modeling_blip_text.py +10 -0
  110. transformers/models/blip_2/modeling_blip_2.py +4 -1
  111. transformers/models/bloom/modeling_bloom.py +17 -44
  112. transformers/models/blt/modeling_blt.py +164 -4
  113. transformers/models/blt/modular_blt.py +170 -5
  114. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  115. transformers/models/bridgetower/modeling_bridgetower.py +11 -1
  116. transformers/models/bros/modeling_bros.py +12 -0
  117. transformers/models/camembert/modeling_camembert.py +109 -106
  118. transformers/models/camembert/tokenization_camembert.py +8 -12
  119. transformers/models/canine/modeling_canine.py +11 -0
  120. transformers/models/canine/tokenization_canine.py +2 -0
  121. transformers/models/chameleon/modeling_chameleon.py +11 -5
  122. transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
  123. transformers/models/clap/feature_extraction_clap.py +2 -2
  124. transformers/models/clap/modeling_clap.py +30 -15
  125. transformers/models/clip/modeling_clip.py +2 -0
  126. transformers/models/clip/tokenization_clip.py +22 -44
  127. transformers/models/clipseg/modeling_clipseg.py +9 -0
  128. transformers/models/clvp/modeling_clvp.py +19 -3
  129. transformers/models/clvp/tokenization_clvp.py +1 -63
  130. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  131. transformers/models/codegen/modeling_codegen.py +13 -4
  132. transformers/models/codegen/tokenization_codegen.py +14 -43
  133. transformers/models/cohere/modeling_cohere.py +5 -4
  134. transformers/models/cohere/modular_cohere.py +2 -1
  135. transformers/models/cohere/tokenization_cohere.py +12 -42
  136. transformers/models/cohere2/modeling_cohere2.py +8 -7
  137. transformers/models/cohere2/modular_cohere2.py +5 -5
  138. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
  139. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  140. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  141. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  142. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  143. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  144. transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
  145. transformers/models/convbert/modeling_convbert.py +9 -0
  146. transformers/models/convnext/image_processing_convnext.py +2 -2
  147. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  148. transformers/models/convnext/modeling_convnext.py +2 -4
  149. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  150. transformers/models/csm/generation_csm.py +19 -22
  151. transformers/models/csm/modeling_csm.py +7 -4
  152. transformers/models/csm/modular_csm.py +2 -0
  153. transformers/models/ctrl/modeling_ctrl.py +15 -2
  154. transformers/models/cvt/modeling_cvt.py +7 -1
  155. transformers/models/cwm/modeling_cwm.py +5 -5
  156. transformers/models/d_fine/configuration_d_fine.py +3 -4
  157. transformers/models/d_fine/modeling_d_fine.py +48 -39
  158. transformers/models/d_fine/modular_d_fine.py +16 -4
  159. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  160. transformers/models/dab_detr/modeling_dab_detr.py +5 -1
  161. transformers/models/dac/modeling_dac.py +6 -6
  162. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  163. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  164. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  165. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  166. transformers/models/dbrx/configuration_dbrx.py +9 -1
  167. transformers/models/dbrx/modeling_dbrx.py +3 -3
  168. transformers/models/deberta/modeling_deberta.py +7 -0
  169. transformers/models/deberta/tokenization_deberta.py +11 -20
  170. transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
  171. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  172. transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
  173. transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
  174. transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
  175. transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
  176. transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
  177. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  178. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  179. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  180. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  181. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  182. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  183. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  184. transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
  185. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  186. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  187. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  188. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  189. transformers/models/detr/configuration_detr.py +1 -1
  190. transformers/models/detr/modeling_detr.py +13 -1
  191. transformers/models/dia/generation_dia.py +3 -10
  192. transformers/models/dia/modeling_dia.py +16 -4
  193. transformers/models/dia/modular_dia.py +11 -1
  194. transformers/models/dia/processing_dia.py +1 -1
  195. transformers/models/diffllama/modeling_diffllama.py +5 -5
  196. transformers/models/diffllama/modular_diffllama.py +2 -2
  197. transformers/models/dinat/modeling_dinat.py +3 -0
  198. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  199. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  200. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
  201. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
  202. transformers/models/distilbert/modeling_distilbert.py +11 -9
  203. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  204. transformers/models/doge/modeling_doge.py +3 -4
  205. transformers/models/doge/modular_doge.py +0 -1
  206. transformers/models/donut/image_processing_donut_fast.py +0 -1
  207. transformers/models/donut/modeling_donut_swin.py +18 -12
  208. transformers/models/dots1/modeling_dots1.py +23 -11
  209. transformers/models/dots1/modular_dots1.py +5 -3
  210. transformers/models/dpr/modeling_dpr.py +5 -0
  211. transformers/models/dpr/tokenization_dpr.py +12 -0
  212. transformers/models/dpt/configuration_dpt.py +1 -1
  213. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  214. transformers/models/dpt/modular_dpt.py +1 -2
  215. transformers/models/edgetam/configuration_edgetam.py +1 -1
  216. transformers/models/edgetam/modeling_edgetam.py +6 -3
  217. transformers/models/edgetam/modular_edgetam.py +15 -14
  218. transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
  219. transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
  220. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  221. transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
  222. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  223. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  224. transformers/models/efficientnet/modeling_efficientnet.py +7 -1
  225. transformers/models/electra/modeling_electra.py +7 -0
  226. transformers/models/emu3/modeling_emu3.py +12 -6
  227. transformers/models/emu3/modular_emu3.py +7 -1
  228. transformers/models/encodec/modeling_encodec.py +14 -0
  229. transformers/models/eomt/image_processing_eomt.py +13 -1
  230. transformers/models/eomt/image_processing_eomt_fast.py +60 -16
  231. transformers/models/eomt/modeling_eomt.py +7 -0
  232. transformers/models/eomt/modular_eomt.py +7 -0
  233. transformers/models/ernie/modeling_ernie.py +6 -0
  234. transformers/models/ernie/modular_ernie.py +6 -0
  235. transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
  236. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  237. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
  238. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
  239. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  240. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  241. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  242. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  243. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  244. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  245. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  246. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  247. transformers/models/esm/modeling_esm.py +6 -0
  248. transformers/models/esm/modeling_esmfold.py +11 -5
  249. transformers/models/evolla/modeling_evolla.py +13 -5
  250. transformers/models/evolla/modular_evolla.py +8 -0
  251. transformers/models/exaone4/modeling_exaone4.py +3 -3
  252. transformers/models/exaone4/modular_exaone4.py +0 -1
  253. transformers/models/falcon/modeling_falcon.py +9 -4
  254. transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
  255. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  256. transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
  257. transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
  258. transformers/models/fast_vlm/__init__.py +27 -0
  259. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  260. transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
  261. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  262. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
  263. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  264. transformers/models/flaubert/modeling_flaubert.py +21 -15
  265. transformers/models/flava/image_processing_flava_fast.py +0 -2
  266. transformers/models/flava/modeling_flava.py +10 -2
  267. transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
  268. transformers/models/florence2/modeling_florence2.py +22 -4
  269. transformers/models/florence2/modular_florence2.py +15 -1
  270. transformers/models/fnet/modeling_fnet.py +14 -0
  271. transformers/models/focalnet/modeling_focalnet.py +4 -0
  272. transformers/models/fsmt/modeling_fsmt.py +2 -0
  273. transformers/models/funnel/modeling_funnel.py +8 -0
  274. transformers/models/funnel/tokenization_funnel.py +17 -24
  275. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  276. transformers/models/fuyu/modeling_fuyu.py +3 -1
  277. transformers/models/fuyu/processing_fuyu.py +19 -3
  278. transformers/models/gemma/modeling_gemma.py +14 -16
  279. transformers/models/gemma/modular_gemma.py +9 -11
  280. transformers/models/gemma/tokenization_gemma.py +10 -27
  281. transformers/models/gemma2/modeling_gemma2.py +5 -5
  282. transformers/models/gemma2/modular_gemma2.py +3 -2
  283. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  284. transformers/models/gemma3/modeling_gemma3.py +42 -91
  285. transformers/models/gemma3/modular_gemma3.py +38 -87
  286. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  287. transformers/models/gemma3n/modeling_gemma3n.py +65 -218
  288. transformers/models/gemma3n/modular_gemma3n.py +68 -68
  289. transformers/models/git/modeling_git.py +183 -126
  290. transformers/models/glm/modeling_glm.py +5 -5
  291. transformers/models/glm4/modeling_glm4.py +5 -5
  292. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  293. transformers/models/glm46v/modeling_glm46v.py +3 -1
  294. transformers/models/glm46v/modular_glm46v.py +3 -0
  295. transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
  296. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  297. transformers/models/glm4v/configuration_glm4v.py +3 -1
  298. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  299. transformers/models/glm4v/modeling_glm4v.py +18 -8
  300. transformers/models/glm4v/modular_glm4v.py +17 -7
  301. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  302. transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
  303. transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
  304. transformers/models/glmasr/__init__.py +30 -0
  305. transformers/models/glmasr/configuration_glmasr.py +197 -0
  306. transformers/models/glmasr/modeling_glmasr.py +512 -0
  307. transformers/models/glmasr/modular_glmasr.py +433 -0
  308. transformers/models/glmasr/processing_glmasr.py +332 -0
  309. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  310. transformers/models/glpn/modeling_glpn.py +2 -0
  311. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  312. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  313. transformers/models/gpt2/modeling_gpt2.py +13 -6
  314. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  315. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
  316. transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
  317. transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
  318. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  319. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  320. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
  321. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  322. transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
  323. transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
  324. transformers/models/gptj/modeling_gptj.py +18 -6
  325. transformers/models/granite/modeling_granite.py +5 -5
  326. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  327. transformers/models/granitemoe/modeling_granitemoe.py +6 -9
  328. transformers/models/granitemoe/modular_granitemoe.py +1 -4
  329. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  330. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
  331. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  332. transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
  333. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  334. transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
  335. transformers/models/groupvit/modeling_groupvit.py +9 -1
  336. transformers/models/helium/modeling_helium.py +5 -4
  337. transformers/models/herbert/tokenization_herbert.py +9 -25
  338. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
  339. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
  340. transformers/models/hiera/modeling_hiera.py +4 -0
  341. transformers/models/hubert/modeling_hubert.py +7 -0
  342. transformers/models/hubert/modular_hubert.py +5 -0
  343. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
  344. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  345. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  346. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
  347. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  348. transformers/models/ibert/modeling_ibert.py +22 -0
  349. transformers/models/idefics/modeling_idefics.py +15 -21
  350. transformers/models/idefics2/modeling_idefics2.py +7 -1
  351. transformers/models/idefics3/modeling_idefics3.py +5 -1
  352. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  353. transformers/models/imagegpt/modeling_imagegpt.py +11 -3
  354. transformers/models/informer/modeling_informer.py +4 -0
  355. transformers/models/informer/modular_informer.py +1 -0
  356. transformers/models/instructblip/modeling_instructblip.py +2 -0
  357. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  358. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  359. transformers/models/internvl/modeling_internvl.py +13 -12
  360. transformers/models/internvl/modular_internvl.py +7 -13
  361. transformers/models/internvl/video_processing_internvl.py +0 -1
  362. transformers/models/jais2/__init__.py +27 -0
  363. transformers/models/jais2/configuration_jais2.py +152 -0
  364. transformers/models/jais2/modeling_jais2.py +486 -0
  365. transformers/models/jais2/modular_jais2.py +196 -0
  366. transformers/models/jamba/modeling_jamba.py +25 -20
  367. transformers/models/jamba/modular_jamba.py +17 -17
  368. transformers/models/janus/image_processing_janus_fast.py +0 -1
  369. transformers/models/janus/modeling_janus.py +16 -7
  370. transformers/models/janus/modular_janus.py +17 -7
  371. transformers/models/jetmoe/modeling_jetmoe.py +4 -4
  372. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  373. transformers/models/kosmos2/modeling_kosmos2.py +15 -2
  374. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  375. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  376. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
  377. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  378. transformers/models/lasr/__init__.py +29 -0
  379. transformers/models/lasr/configuration_lasr.py +248 -0
  380. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  381. transformers/models/lasr/modeling_lasr.py +730 -0
  382. transformers/models/lasr/modular_lasr.py +576 -0
  383. transformers/models/lasr/processing_lasr.py +94 -0
  384. transformers/models/lasr/tokenization_lasr.py +186 -0
  385. transformers/models/layoutlm/modeling_layoutlm.py +10 -3
  386. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  387. transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
  388. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
  389. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  390. transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
  391. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  392. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  393. transformers/models/led/modeling_led.py +12 -0
  394. transformers/models/levit/modeling_levit.py +21 -0
  395. transformers/models/lfm2/modeling_lfm2.py +5 -6
  396. transformers/models/lfm2/modular_lfm2.py +0 -1
  397. transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
  398. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  399. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  400. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  401. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  402. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  403. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  404. transformers/models/lightglue/modeling_lightglue.py +3 -1
  405. transformers/models/lightglue/modular_lightglue.py +1 -0
  406. transformers/models/lilt/modeling_lilt.py +23 -15
  407. transformers/models/llama/modeling_llama.py +5 -5
  408. transformers/models/llama/tokenization_llama.py +15 -43
  409. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  410. transformers/models/llama4/modeling_llama4.py +11 -6
  411. transformers/models/llava/image_processing_llava_fast.py +0 -1
  412. transformers/models/llava/modeling_llava.py +12 -7
  413. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  414. transformers/models/llava_next/modeling_llava_next.py +7 -3
  415. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  416. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  417. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  418. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  419. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  420. transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
  421. transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
  422. transformers/models/longformer/modeling_longformer.py +6 -0
  423. transformers/models/longt5/modeling_longt5.py +4 -4
  424. transformers/models/luke/modeling_luke.py +9 -0
  425. transformers/models/luke/tokenization_luke.py +11 -38
  426. transformers/models/lxmert/modeling_lxmert.py +2 -0
  427. transformers/models/m2m_100/modeling_m2m_100.py +14 -0
  428. transformers/models/mamba/modeling_mamba.py +16 -23
  429. transformers/models/mamba2/modeling_mamba2.py +24 -23
  430. transformers/models/marian/configuration_marian.py +1 -1
  431. transformers/models/marian/modeling_marian.py +8 -0
  432. transformers/models/markuplm/modeling_markuplm.py +9 -8
  433. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  434. transformers/models/mask2former/configuration_mask2former.py +3 -3
  435. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  436. transformers/models/mask2former/modeling_mask2former.py +11 -0
  437. transformers/models/maskformer/configuration_maskformer.py +3 -3
  438. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  439. transformers/models/maskformer/modeling_maskformer.py +11 -1
  440. transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
  441. transformers/models/mbart/configuration_mbart.py +1 -0
  442. transformers/models/mbart/modeling_mbart.py +14 -0
  443. transformers/models/mbart/tokenization_mbart.py +11 -52
  444. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  445. transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
  446. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  447. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  448. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  449. transformers/models/mimi/modeling_mimi.py +28 -5
  450. transformers/models/minimax/modeling_minimax.py +19 -6
  451. transformers/models/minimax/modular_minimax.py +12 -1
  452. transformers/models/ministral/modeling_ministral.py +5 -5
  453. transformers/models/ministral3/configuration_ministral3.py +1 -1
  454. transformers/models/ministral3/modeling_ministral3.py +5 -4
  455. transformers/models/mistral/modeling_mistral.py +5 -4
  456. transformers/models/mistral3/modeling_mistral3.py +10 -4
  457. transformers/models/mistral3/modular_mistral3.py +3 -1
  458. transformers/models/mixtral/modeling_mixtral.py +15 -7
  459. transformers/models/mixtral/modular_mixtral.py +6 -2
  460. transformers/models/mlcd/modeling_mlcd.py +6 -0
  461. transformers/models/mlcd/modular_mlcd.py +4 -0
  462. transformers/models/mllama/modeling_mllama.py +15 -4
  463. transformers/models/mluke/tokenization_mluke.py +6 -6
  464. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  465. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
  466. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  467. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  468. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  469. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  470. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  471. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  472. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  473. transformers/models/mobilevit/modeling_mobilevit.py +7 -0
  474. transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
  475. transformers/models/modernbert/modeling_modernbert.py +16 -2
  476. transformers/models/modernbert/modular_modernbert.py +14 -1
  477. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
  478. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
  479. transformers/models/moonshine/modeling_moonshine.py +5 -3
  480. transformers/models/moshi/modeling_moshi.py +26 -53
  481. transformers/models/mpnet/modeling_mpnet.py +7 -0
  482. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  483. transformers/models/mpt/modeling_mpt.py +2 -0
  484. transformers/models/mra/modeling_mra.py +10 -1
  485. transformers/models/mt5/configuration_mt5.py +2 -3
  486. transformers/models/mt5/modeling_mt5.py +7 -10
  487. transformers/models/musicgen/modeling_musicgen.py +7 -9
  488. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
  489. transformers/models/mvp/modeling_mvp.py +14 -0
  490. transformers/models/nanochat/modeling_nanochat.py +5 -5
  491. transformers/models/nemotron/modeling_nemotron.py +7 -5
  492. transformers/models/nllb/tokenization_nllb.py +8 -22
  493. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  494. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  495. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  496. transformers/models/nougat/tokenization_nougat.py +15 -68
  497. transformers/models/nystromformer/modeling_nystromformer.py +13 -0
  498. transformers/models/olmo/modeling_olmo.py +5 -5
  499. transformers/models/olmo/modular_olmo.py +2 -2
  500. transformers/models/olmo2/modeling_olmo2.py +5 -6
  501. transformers/models/olmo2/modular_olmo2.py +0 -1
  502. transformers/models/olmo3/modeling_olmo3.py +5 -5
  503. transformers/models/olmoe/modeling_olmoe.py +15 -7
  504. transformers/models/olmoe/modular_olmoe.py +4 -2
  505. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  506. transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
  507. transformers/models/oneformer/configuration_oneformer.py +3 -3
  508. transformers/models/oneformer/modeling_oneformer.py +11 -39
  509. transformers/models/openai/modeling_openai.py +15 -0
  510. transformers/models/openai/tokenization_openai.py +10 -46
  511. transformers/models/opt/modeling_opt.py +2 -0
  512. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  513. transformers/models/ovis2/modeling_ovis2.py +15 -3
  514. transformers/models/ovis2/modular_ovis2.py +8 -0
  515. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  516. transformers/models/owlv2/modeling_owlv2.py +11 -3
  517. transformers/models/owlv2/modular_owlv2.py +0 -2
  518. transformers/models/owlvit/modeling_owlvit.py +11 -3
  519. transformers/models/paddleocr_vl/__init__.py +32 -0
  520. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  521. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
  522. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  523. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
  524. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
  525. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  526. transformers/models/paligemma/modeling_paligemma.py +25 -17
  527. transformers/models/parakeet/configuration_parakeet.py +4 -6
  528. transformers/models/parakeet/modeling_parakeet.py +14 -6
  529. transformers/models/parakeet/modular_parakeet.py +7 -2
  530. transformers/models/parakeet/processing_parakeet.py +1 -0
  531. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  532. transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
  533. transformers/models/patchtst/modeling_patchtst.py +25 -6
  534. transformers/models/pe_audio/__init__.py +30 -0
  535. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  536. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  537. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  538. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  539. transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
  540. transformers/models/pe_audio_video/__init__.py +29 -0
  541. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  542. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  543. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  544. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  545. transformers/models/pe_video/__init__.py +30 -0
  546. transformers/models/pe_video/configuration_pe_video.py +211 -0
  547. transformers/models/pe_video/modeling_pe_video.py +636 -0
  548. transformers/models/pe_video/modular_pe_video.py +219 -0
  549. transformers/models/pe_video/processing_pe_video.py +10 -0
  550. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  551. transformers/models/pegasus/configuration_pegasus.py +1 -0
  552. transformers/models/pegasus/modeling_pegasus.py +8 -0
  553. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  554. transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
  555. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  556. transformers/models/perceiver/modeling_perceiver.py +13 -1
  557. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  558. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  559. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  560. transformers/models/persimmon/modeling_persimmon.py +3 -2
  561. transformers/models/phi/modeling_phi.py +5 -6
  562. transformers/models/phi/modular_phi.py +0 -1
  563. transformers/models/phi3/modeling_phi3.py +3 -2
  564. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
  565. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
  566. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  567. transformers/models/phimoe/modeling_phimoe.py +15 -7
  568. transformers/models/phimoe/modular_phimoe.py +3 -3
  569. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  570. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  571. transformers/models/pixio/__init__.py +30 -0
  572. transformers/models/pixio/configuration_pixio.py +151 -0
  573. transformers/models/pixio/modeling_pixio.py +507 -0
  574. transformers/models/pixio/modular_pixio.py +404 -0
  575. transformers/models/pixtral/modeling_pixtral.py +3 -2
  576. transformers/models/pixtral/processing_pixtral.py +3 -1
  577. transformers/models/plbart/configuration_plbart.py +1 -0
  578. transformers/models/plbart/modeling_plbart.py +13 -0
  579. transformers/models/plbart/modular_plbart.py +8 -0
  580. transformers/models/plbart/tokenization_plbart.py +0 -2
  581. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  582. transformers/models/poolformer/modeling_poolformer.py +13 -1
  583. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  584. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  585. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  586. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  587. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  588. transformers/models/prophetnet/modeling_prophetnet.py +5 -1
  589. transformers/models/pvt/modeling_pvt.py +2 -0
  590. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  591. transformers/models/qwen2/modeling_qwen2.py +5 -5
  592. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  593. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  594. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
  595. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
  596. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  597. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
  598. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
  599. transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
  600. transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
  601. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  602. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  603. transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
  604. transformers/models/qwen3/modeling_qwen3.py +5 -5
  605. transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
  606. transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
  607. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  608. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
  609. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
  610. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  611. transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
  612. transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
  613. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  614. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
  615. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
  616. transformers/models/rag/configuration_rag.py +0 -8
  617. transformers/models/rag/modeling_rag.py +8 -9
  618. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
  619. transformers/models/reformer/modeling_reformer.py +13 -1
  620. transformers/models/reformer/tokenization_reformer.py +11 -28
  621. transformers/models/regnet/modeling_regnet.py +10 -1
  622. transformers/models/rembert/modeling_rembert.py +13 -1
  623. transformers/models/rembert/tokenization_rembert.py +3 -10
  624. transformers/models/resnet/modeling_resnet.py +19 -5
  625. transformers/models/roberta/modeling_roberta.py +3 -0
  626. transformers/models/roberta/modular_roberta.py +3 -0
  627. transformers/models/roberta/tokenization_roberta.py +18 -27
  628. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  629. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  630. transformers/models/roformer/modeling_roformer.py +6 -0
  631. transformers/models/roformer/tokenization_roformer.py +77 -412
  632. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  633. transformers/models/rt_detr/modeling_rt_detr.py +6 -0
  634. transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
  635. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  636. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
  637. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  638. transformers/models/rwkv/modeling_rwkv.py +2 -1
  639. transformers/models/sam/configuration_sam.py +1 -0
  640. transformers/models/sam/image_processing_sam_fast.py +0 -1
  641. transformers/models/sam/modeling_sam.py +4 -1
  642. transformers/models/sam2/configuration_sam2.py +1 -1
  643. transformers/models/sam2/modeling_sam2.py +7 -3
  644. transformers/models/sam2/modular_sam2.py +7 -3
  645. transformers/models/sam2_video/modeling_sam2_video.py +52 -43
  646. transformers/models/sam2_video/modular_sam2_video.py +32 -18
  647. transformers/models/sam3/configuration_sam3.py +21 -1
  648. transformers/models/sam3/modeling_sam3.py +100 -80
  649. transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
  650. transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
  651. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  652. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
  653. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  654. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  655. transformers/models/sam3_video/modeling_sam3_video.py +4 -3
  656. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  657. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  658. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  659. transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
  660. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  661. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
  662. transformers/models/seed_oss/modeling_seed_oss.py +3 -3
  663. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  664. transformers/models/segformer/modeling_segformer.py +6 -3
  665. transformers/models/segformer/modular_segformer.py +0 -1
  666. transformers/models/seggpt/modeling_seggpt.py +2 -0
  667. transformers/models/sew/modeling_sew.py +3 -0
  668. transformers/models/sew/modular_sew.py +1 -0
  669. transformers/models/sew_d/modeling_sew_d.py +3 -0
  670. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  671. transformers/models/siglip/modeling_siglip.py +24 -2
  672. transformers/models/siglip2/modeling_siglip2.py +67 -41
  673. transformers/models/siglip2/modular_siglip2.py +4 -0
  674. transformers/models/smollm3/modeling_smollm3.py +5 -5
  675. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  676. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  677. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  678. transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
  679. transformers/models/speecht5/modeling_speecht5.py +41 -1
  680. transformers/models/splinter/modeling_splinter.py +12 -3
  681. transformers/models/splinter/tokenization_splinter.py +9 -28
  682. transformers/models/squeezebert/modeling_squeezebert.py +8 -0
  683. transformers/models/stablelm/modeling_stablelm.py +4 -2
  684. transformers/models/starcoder2/modeling_starcoder2.py +5 -4
  685. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  686. transformers/models/superglue/modeling_superglue.py +1 -0
  687. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  688. transformers/models/superpoint/modeling_superpoint.py +1 -0
  689. transformers/models/swiftformer/modeling_swiftformer.py +6 -0
  690. transformers/models/swin/modeling_swin.py +20 -12
  691. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  692. transformers/models/swin2sr/modeling_swin2sr.py +51 -33
  693. transformers/models/swinv2/modeling_swinv2.py +45 -33
  694. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  695. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  696. transformers/models/t5/configuration_t5.py +7 -1
  697. transformers/models/t5/modeling_t5.py +8 -7
  698. transformers/models/t5/tokenization_t5.py +4 -8
  699. transformers/models/t5gemma/modeling_t5gemma.py +6 -6
  700. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  701. transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
  702. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  703. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  704. transformers/models/table_transformer/modeling_table_transformer.py +5 -1
  705. transformers/models/tapas/modeling_tapas.py +3 -0
  706. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  707. transformers/models/textnet/modeling_textnet.py +11 -2
  708. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  709. transformers/models/timesfm/modeling_timesfm.py +14 -0
  710. transformers/models/timesfm/modular_timesfm.py +14 -0
  711. transformers/models/timesformer/modeling_timesformer.py +2 -0
  712. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  713. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  714. transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
  715. transformers/models/trocr/modeling_trocr.py +3 -2
  716. transformers/models/tvp/configuration_tvp.py +5 -1
  717. transformers/models/tvp/modeling_tvp.py +6 -4
  718. transformers/models/udop/configuration_udop.py +1 -0
  719. transformers/models/udop/modeling_udop.py +7 -7
  720. transformers/models/udop/tokenization_udop.py +5 -13
  721. transformers/models/umt5/configuration_umt5.py +2 -2
  722. transformers/models/umt5/modeling_umt5.py +7 -6
  723. transformers/models/unispeech/modeling_unispeech.py +4 -0
  724. transformers/models/unispeech/modular_unispeech.py +2 -0
  725. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  726. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  727. transformers/models/univnet/modeling_univnet.py +1 -0
  728. transformers/models/upernet/modeling_upernet.py +1 -0
  729. transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
  730. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  731. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  732. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  733. transformers/models/video_llava/modeling_video_llava.py +7 -3
  734. transformers/models/vilt/configuration_vilt.py +2 -2
  735. transformers/models/vilt/modeling_vilt.py +13 -0
  736. transformers/models/vipllava/modeling_vipllava.py +7 -3
  737. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  738. transformers/models/visual_bert/modeling_visual_bert.py +8 -0
  739. transformers/models/vitdet/modeling_vitdet.py +2 -0
  740. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  741. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  742. transformers/models/vitmatte/modeling_vitmatte.py +5 -0
  743. transformers/models/vitpose/configuration_vitpose.py +1 -1
  744. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  745. transformers/models/vits/modeling_vits.py +1 -0
  746. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  747. transformers/models/voxtral/modeling_voxtral.py +2 -2
  748. transformers/models/voxtral/modular_voxtral.py +2 -2
  749. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  750. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
  751. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
  752. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
  753. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  754. transformers/models/wavlm/modeling_wavlm.py +5 -0
  755. transformers/models/whisper/generation_whisper.py +1 -0
  756. transformers/models/whisper/modeling_whisper.py +11 -3
  757. transformers/models/whisper/tokenization_whisper.py +4 -15
  758. transformers/models/x_clip/modeling_x_clip.py +5 -0
  759. transformers/models/xcodec/modeling_xcodec.py +5 -0
  760. transformers/models/xglm/modeling_xglm.py +11 -0
  761. transformers/models/xglm/tokenization_xglm.py +4 -9
  762. transformers/models/xlm/modeling_xlm.py +18 -14
  763. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  764. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  765. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  766. transformers/models/xlnet/modeling_xlnet.py +3 -1
  767. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  768. transformers/models/xmod/modeling_xmod.py +3 -0
  769. transformers/models/yoso/modeling_yoso.py +10 -1
  770. transformers/models/zamba/modeling_zamba.py +4 -1
  771. transformers/models/zamba2/modeling_zamba2.py +7 -4
  772. transformers/models/zamba2/modular_zamba2.py +1 -1
  773. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  774. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  775. transformers/models/zoedepth/modeling_zoedepth.py +8 -0
  776. transformers/pipelines/__init__.py +11 -9
  777. transformers/pipelines/automatic_speech_recognition.py +20 -12
  778. transformers/pipelines/base.py +2 -10
  779. transformers/pipelines/document_question_answering.py +4 -2
  780. transformers/pipelines/question_answering.py +1 -1
  781. transformers/pipelines/text_generation.py +1 -1
  782. transformers/pipelines/text_to_audio.py +2 -2
  783. transformers/processing_utils.py +133 -50
  784. transformers/quantizers/auto.py +2 -4
  785. transformers/quantizers/base.py +44 -174
  786. transformers/quantizers/quantizer_aqlm.py +2 -23
  787. transformers/quantizers/quantizer_auto_round.py +2 -12
  788. transformers/quantizers/quantizer_awq.py +20 -89
  789. transformers/quantizers/quantizer_bitnet.py +4 -14
  790. transformers/quantizers/quantizer_bnb_4bit.py +18 -155
  791. transformers/quantizers/quantizer_bnb_8bit.py +24 -110
  792. transformers/quantizers/quantizer_compressed_tensors.py +2 -9
  793. transformers/quantizers/quantizer_eetq.py +16 -74
  794. transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
  795. transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
  796. transformers/quantizers/quantizer_fp_quant.py +52 -82
  797. transformers/quantizers/quantizer_gptq.py +8 -28
  798. transformers/quantizers/quantizer_higgs.py +42 -60
  799. transformers/quantizers/quantizer_hqq.py +144 -153
  800. transformers/quantizers/quantizer_mxfp4.py +14 -194
  801. transformers/quantizers/quantizer_quanto.py +35 -79
  802. transformers/quantizers/quantizer_quark.py +36 -17
  803. transformers/quantizers/quantizer_spqr.py +4 -12
  804. transformers/quantizers/quantizer_torchao.py +50 -325
  805. transformers/quantizers/quantizer_vptq.py +4 -27
  806. transformers/quantizers/quantizers_utils.py +20 -0
  807. transformers/testing_utils.py +324 -47
  808. transformers/tokenization_mistral_common.py +7 -2
  809. transformers/tokenization_utils_base.py +116 -224
  810. transformers/tokenization_utils_tokenizers.py +190 -106
  811. transformers/trainer.py +51 -32
  812. transformers/trainer_callback.py +8 -0
  813. transformers/trainer_jit_checkpoint.py +126 -0
  814. transformers/trainer_seq2seq.py +4 -0
  815. transformers/trainer_utils.py +1 -1
  816. transformers/training_args.py +74 -38
  817. transformers/utils/__init__.py +7 -4
  818. transformers/utils/attention_visualizer.py +4 -4
  819. transformers/utils/auto_docstring.py +35 -25
  820. transformers/utils/generic.py +47 -1
  821. transformers/utils/hub.py +5 -15
  822. transformers/utils/import_utils.py +112 -25
  823. transformers/utils/kernel_config.py +74 -19
  824. transformers/utils/loading_report.py +19 -10
  825. transformers/utils/quantization_config.py +78 -245
  826. transformers/video_processing_utils.py +17 -14
  827. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
  828. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
  829. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
  830. transformers/kernels/__init__.py +0 -0
  831. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  832. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  833. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  834. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
  835. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -16,7 +16,7 @@
16
16
  """Tokenization classes for Splinter."""
17
17
 
18
18
  import collections
19
- from typing import Optional
19
+ from typing import Optional, Union
20
20
 
21
21
  from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
22
22
  from tokenizers.models import WordPiece
@@ -72,16 +72,17 @@ class SplinterTokenizer(TokenizersBackend):
72
72
  strip_accents (`bool`, *optional*):
73
73
  Whether or not to strip all accents. If this option is not specified, then it will be determined by the
74
74
  value for `lowercase`.
75
- vocab (`dict`, *optional*):
75
+ vocab (`str`, `dict` or `list`, *optional*):
76
76
  Custom vocabulary dictionary. If not provided, a minimal vocabulary is created.
77
77
  """
78
78
 
79
79
  vocab_files_names = VOCAB_FILES_NAMES
80
80
  model_input_names = ["input_ids", "attention_mask"]
81
- slow_tokenizer_class = None
81
+ model = WordPiece
82
82
 
83
83
  def __init__(
84
84
  self,
85
+ vocab: Optional[Union[str, dict[str, int]]] = None,
85
86
  do_lower_case: bool = True,
86
87
  unk_token: str = "[UNK]",
87
88
  sep_token: str = "[SEP]",
@@ -91,15 +92,12 @@ class SplinterTokenizer(TokenizersBackend):
91
92
  question_token: str = "[QUESTION]",
92
93
  tokenize_chinese_chars: bool = True,
93
94
  strip_accents: Optional[bool] = None,
94
- vocab: Optional[dict] = None,
95
95
  **kwargs,
96
96
  ):
97
- if vocab is not None:
98
- self._vocab = (
99
- {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
100
- )
101
- else:
102
- self._vocab = {
97
+ self._vocab = (
98
+ vocab
99
+ if vocab is not None
100
+ else {
103
101
  str(pad_token): 0,
104
102
  str(unk_token): 1,
105
103
  str(cls_token): 2,
@@ -108,6 +106,7 @@ class SplinterTokenizer(TokenizersBackend):
108
106
  str(question_token): 5,
109
107
  ".": 6,
110
108
  }
109
+ )
111
110
 
112
111
  self._tokenizer = Tokenizer(WordPiece(self._vocab, unk_token=str(unk_token)))
113
112
 
@@ -120,10 +119,7 @@ class SplinterTokenizer(TokenizersBackend):
120
119
  self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
121
120
  self._tokenizer.decoder = decoders.WordPiece(prefix="##")
122
121
 
123
- tokenizer_object = self._tokenizer
124
-
125
122
  super().__init__(
126
- tokenizer_object=tokenizer_object,
127
123
  unk_token=unk_token,
128
124
  sep_token=sep_token,
129
125
  pad_token=pad_token,
@@ -136,21 +132,6 @@ class SplinterTokenizer(TokenizersBackend):
136
132
  **kwargs,
137
133
  )
138
134
 
139
- if hasattr(self, "_tokenizer") and self._tokenizer.normalizer is not None:
140
- import json
141
-
142
- pre_tok_state = json.loads(self._tokenizer.normalizer.__getstate__())
143
- if (
144
- pre_tok_state.get("lowercase", do_lower_case) != do_lower_case
145
- or pre_tok_state.get("strip_accents", strip_accents) != strip_accents
146
- or pre_tok_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
147
- ):
148
- pre_tok_class = getattr(normalizers, pre_tok_state.pop("type"))
149
- pre_tok_state["lowercase"] = do_lower_case
150
- pre_tok_state["strip_accents"] = strip_accents
151
- pre_tok_state["handle_chinese_chars"] = tokenize_chinese_chars
152
- self._tokenizer.normalizer = pre_tok_class(**pre_tok_state)
153
-
154
135
  self.do_lower_case = do_lower_case
155
136
  self.tokenize_chinese_chars = tokenize_chinese_chars
156
137
  self.strip_accents = strip_accents
@@ -412,6 +412,8 @@ class SqueezeBertPreTrainedModel(PreTrainedModel):
412
412
  super()._init_weights(module)
413
413
  if isinstance(module, SqueezeBertLMPredictionHead):
414
414
  init.zeros_(module.bias)
415
+ elif isinstance(module, SqueezeBertEmbeddings):
416
+ init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
415
417
 
416
418
 
417
419
  @auto_docstring
@@ -443,6 +445,7 @@ class SqueezeBertModel(SqueezeBertPreTrainedModel):
443
445
  output_attentions: Optional[bool] = None,
444
446
  output_hidden_states: Optional[bool] = None,
445
447
  return_dict: Optional[bool] = None,
448
+ **kwargs,
446
449
  ) -> Union[tuple, BaseModelOutputWithPooling]:
447
450
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
448
451
  output_hidden_states = (
@@ -528,6 +531,7 @@ class SqueezeBertForMaskedLM(SqueezeBertPreTrainedModel):
528
531
  output_attentions: Optional[bool] = None,
529
532
  output_hidden_states: Optional[bool] = None,
530
533
  return_dict: Optional[bool] = None,
534
+ **kwargs,
531
535
  ) -> Union[tuple, MaskedLMOutput]:
532
536
  r"""
533
537
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -599,6 +603,7 @@ class SqueezeBertForSequenceClassification(SqueezeBertPreTrainedModel):
599
603
  output_attentions: Optional[bool] = None,
600
604
  output_hidden_states: Optional[bool] = None,
601
605
  return_dict: Optional[bool] = None,
606
+ **kwargs,
602
607
  ) -> Union[tuple, SequenceClassifierOutput]:
603
608
  r"""
604
609
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -683,6 +688,7 @@ class SqueezeBertForMultipleChoice(SqueezeBertPreTrainedModel):
683
688
  output_attentions: Optional[bool] = None,
684
689
  output_hidden_states: Optional[bool] = None,
685
690
  return_dict: Optional[bool] = None,
691
+ **kwargs,
686
692
  ) -> Union[tuple, MultipleChoiceModelOutput]:
687
693
  r"""
688
694
  input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -786,6 +792,7 @@ class SqueezeBertForTokenClassification(SqueezeBertPreTrainedModel):
786
792
  output_attentions: Optional[bool] = None,
787
793
  output_hidden_states: Optional[bool] = None,
788
794
  return_dict: Optional[bool] = None,
795
+ **kwargs,
789
796
  ) -> Union[tuple, TokenClassifierOutput]:
790
797
  r"""
791
798
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -851,6 +858,7 @@ class SqueezeBertForQuestionAnswering(SqueezeBertPreTrainedModel):
851
858
  output_attentions: Optional[bool] = None,
852
859
  output_hidden_states: Optional[bool] = None,
853
860
  return_dict: Optional[bool] = None,
861
+ **kwargs,
854
862
  ) -> Union[tuple, QuestionAnsweringModelOutput]:
855
863
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
856
864
 
@@ -45,6 +45,7 @@ from ...modeling_rope_utils import (
45
45
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
46
46
  from ...processing_utils import Unpack
47
47
  from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
48
+ from ...utils.generic import maybe_autocast
48
49
  from .configuration_stablelm import StableLmConfig
49
50
 
50
51
 
@@ -75,7 +76,7 @@ class StableLmRotaryEmbedding(nn.Module):
75
76
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
76
77
 
77
78
  self.register_buffer("inv_freq", inv_freq, persistent=False)
78
- self.original_inv_freq = inv_freq
79
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
79
80
 
80
81
  @staticmethod
81
82
  # Ignore copy
@@ -117,7 +118,7 @@ class StableLmRotaryEmbedding(nn.Module):
117
118
  position_ids_expanded = position_ids[:, None, :].float()
118
119
 
119
120
  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
120
- with torch.autocast(device_type=device_type, enabled=False): # Force float32
121
+ with maybe_autocast(device_type=device_type, enabled=False): # Force float32
121
122
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
122
123
  emb = torch.cat((freqs, freqs), dim=-1)
123
124
  cos = emb.cos() * self.attention_scaling
@@ -492,6 +493,7 @@ class StableLmModel(StableLmPreTrainedModel):
492
493
  output_attentions: Optional[bool] = None,
493
494
  output_hidden_states: Optional[bool] = None,
494
495
  cache_position: Optional[torch.LongTensor] = None,
496
+ **kwargs,
495
497
  ) -> BaseModelOutputWithPast:
496
498
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
497
499
  output_hidden_states = (
@@ -35,7 +35,7 @@ from transformers.utils.generic import check_model_inputs
35
35
  from ...activations import ACT2FN
36
36
  from ...cache_utils import Cache, DynamicCache
37
37
  from ...generation import GenerationMixin
38
- from ...integrations import use_kernel_func_from_hub
38
+ from ...integrations import use_kernel_func_from_hub, use_kernelized_func
39
39
  from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
40
40
  from ...modeling_flash_attention_utils import FlashAttentionKwargs
41
41
  from ...modeling_layers import (
@@ -48,6 +48,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
48
48
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
49
49
  from ...processing_utils import Unpack
50
50
  from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
51
+ from ...utils.generic import maybe_autocast
51
52
  from .configuration_starcoder2 import Starcoder2Config
52
53
 
53
54
 
@@ -141,6 +142,7 @@ def eager_attention_forward(
141
142
  return attn_output, attn_weights
142
143
 
143
144
 
145
+ @use_kernelized_func(apply_rotary_pos_emb)
144
146
  class Starcoder2Attention(nn.Module):
145
147
  """Multi-headed attention from 'Attention Is All You Need' paper"""
146
148
 
@@ -157,7 +159,6 @@ class Starcoder2Attention(nn.Module):
157
159
  self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
158
160
  self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
159
161
  self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.use_bias)
160
- self.rotary_fn = apply_rotary_pos_emb
161
162
  self.residual_dropout = config.residual_dropout
162
163
 
163
164
  def forward(
@@ -288,7 +289,7 @@ class Starcoder2RotaryEmbedding(nn.Module):
288
289
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
289
290
 
290
291
  self.register_buffer("inv_freq", inv_freq, persistent=False)
291
- self.original_inv_freq = inv_freq
292
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
292
293
 
293
294
  @staticmethod
294
295
  def compute_default_rope_parameters(
@@ -327,7 +328,7 @@ class Starcoder2RotaryEmbedding(nn.Module):
327
328
  position_ids_expanded = position_ids[:, None, :].float()
328
329
 
329
330
  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
330
- with torch.autocast(device_type=device_type, enabled=False): # Force float32
331
+ with maybe_autocast(device_type=device_type, enabled=False): # Force float32
331
332
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
332
333
  emb = torch.cat((freqs, freqs), dim=-1)
333
334
  cos = emb.cos() * self.attention_scaling
@@ -161,9 +161,8 @@ class SuperGlueImageProcessorFast(BaseImageProcessorFast):
161
161
  stacked_pairs = [torch.stack(pair, dim=0) for pair in image_pairs]
162
162
 
163
163
  # Return in same format as slow processor
164
- image_pairs = torch.stack(stacked_pairs, dim=0) if return_tensors else stacked_pairs
165
164
 
166
- return BatchFeature(data={"pixel_values": image_pairs})
165
+ return BatchFeature(data={"pixel_values": stacked_pairs}, tensor_type=return_tensors)
167
166
 
168
167
  def post_process_keypoint_matching(
169
168
  self,
@@ -670,6 +670,7 @@ class SuperGlueForKeypointMatching(SuperGluePreTrainedModel):
670
670
  output_attentions: Optional[bool] = None,
671
671
  output_hidden_states: Optional[bool] = None,
672
672
  return_dict: Optional[bool] = None,
673
+ **kwargs,
673
674
  ) -> Union[tuple, SuperGlueKeypointMatchingOutput]:
674
675
  r"""
675
676
  Examples:
@@ -110,8 +110,7 @@ class SuperPointImageProcessorFast(BaseImageProcessorFast):
110
110
  stacked_images = self.rescale(stacked_images, rescale_factor)
111
111
  processed_images_grouped[shape] = stacked_images
112
112
  processed_images = reorder_images(processed_images_grouped, grouped_images_index)
113
- processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
114
- return BatchFeature(data={"pixel_values": processed_images})
113
+ return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
115
114
 
116
115
  def post_process_keypoint_detection(
117
116
  self, outputs: "SuperPointKeypointDescriptionOutput", target_sizes: Union[TensorType, list[tuple]]
@@ -378,6 +378,7 @@ class SuperPointForKeypointDetection(SuperPointPreTrainedModel):
378
378
  labels: Optional[torch.LongTensor] = None,
379
379
  output_hidden_states: Optional[bool] = None,
380
380
  return_dict: Optional[bool] = None,
381
+ **kwargs,
381
382
  ) -> Union[tuple, SuperPointKeypointDescriptionOutput]:
382
383
  r"""
383
384
  Examples:
@@ -400,6 +400,10 @@ class SwiftFormerPreTrainedModel(PreTrainedModel):
400
400
  elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
401
401
  init.constant_(module.bias, 0)
402
402
  init.constant_(module.weight, 1.0)
403
+ if getattr(module, "running_mean", None) is not None:
404
+ init.zeros_(module.running_mean)
405
+ init.ones_(module.running_var)
406
+ init.zeros_(module.num_batches_tracked)
403
407
  elif isinstance(module, (SwiftFormerConvEncoder, SwiftFormerLocalRepresentation)):
404
408
  init.ones_(module.layer_scale)
405
409
  elif isinstance(module, SwiftFormerEncoderBlock):
@@ -428,6 +432,7 @@ class SwiftFormerModel(SwiftFormerPreTrainedModel):
428
432
  pixel_values: Optional[torch.Tensor] = None,
429
433
  output_hidden_states: Optional[bool] = None,
430
434
  return_dict: Optional[bool] = None,
435
+ **kwargs,
431
436
  ) -> Union[tuple, BaseModelOutputWithNoAttention]:
432
437
  output_hidden_states = (
433
438
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -478,6 +483,7 @@ class SwiftFormerForImageClassification(SwiftFormerPreTrainedModel):
478
483
  labels: Optional[torch.Tensor] = None,
479
484
  output_hidden_states: Optional[bool] = None,
480
485
  return_dict: Optional[bool] = None,
486
+ **kwargs,
481
487
  ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
482
488
  r"""
483
489
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -411,18 +411,7 @@ class SwinSelfAttention(nn.Module):
411
411
  torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
412
412
  )
413
413
 
414
- # get pair-wise relative position index for each token inside the window
415
- coords_h = torch.arange(self.window_size[0])
416
- coords_w = torch.arange(self.window_size[1])
417
- coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
418
- coords_flatten = torch.flatten(coords, 1)
419
- relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
420
- relative_coords = relative_coords.permute(1, 2, 0).contiguous()
421
- relative_coords[:, :, 0] += self.window_size[0] - 1
422
- relative_coords[:, :, 1] += self.window_size[1] - 1
423
- relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
424
- relative_position_index = relative_coords.sum(-1)
425
- self.register_buffer("relative_position_index", relative_position_index)
414
+ self.register_buffer("relative_position_index", self.create_relative_position_index())
426
415
 
427
416
  self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
428
417
  self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
@@ -481,6 +470,20 @@ class SwinSelfAttention(nn.Module):
481
470
 
482
471
  return outputs
483
472
 
473
+ def create_relative_position_index(self):
474
+ # get pair-wise relative position index for each token inside the window
475
+ coords_h = torch.arange(self.window_size[0])
476
+ coords_w = torch.arange(self.window_size[1])
477
+ coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
478
+ coords_flatten = torch.flatten(coords, 1)
479
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
480
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous()
481
+ relative_coords[:, :, 0] += self.window_size[0] - 1
482
+ relative_coords[:, :, 1] += self.window_size[1] - 1
483
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
484
+ relative_position_index = relative_coords.sum(-1)
485
+ return relative_position_index
486
+
484
487
 
485
488
  class SwinSelfOutput(nn.Module):
486
489
  def __init__(self, config, dim):
@@ -823,6 +826,7 @@ class SwinPreTrainedModel(PreTrainedModel):
823
826
  init.zeros_(module.position_embeddings)
824
827
  elif isinstance(module, SwinSelfAttention):
825
828
  init.zeros_(module.relative_position_bias_table)
829
+ init.copy_(module.relative_position_index, module.create_relative_position_index())
826
830
 
827
831
 
828
832
  @auto_docstring
@@ -860,6 +864,7 @@ class SwinModel(SwinPreTrainedModel):
860
864
  output_hidden_states: Optional[bool] = None,
861
865
  interpolate_pos_encoding: bool = False,
862
866
  return_dict: Optional[bool] = None,
867
+ **kwargs,
863
868
  ) -> Union[tuple, SwinModelOutput]:
864
869
  r"""
865
870
  bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
@@ -946,6 +951,7 @@ class SwinForMaskedImageModeling(SwinPreTrainedModel):
946
951
  output_hidden_states: Optional[bool] = None,
947
952
  interpolate_pos_encoding: bool = False,
948
953
  return_dict: Optional[bool] = None,
954
+ **kwargs,
949
955
  ) -> Union[tuple, SwinMaskedImageModelingOutput]:
950
956
  r"""
951
957
  bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
@@ -1059,6 +1065,7 @@ class SwinForImageClassification(SwinPreTrainedModel):
1059
1065
  output_hidden_states: Optional[bool] = None,
1060
1066
  interpolate_pos_encoding: bool = False,
1061
1067
  return_dict: Optional[bool] = None,
1068
+ **kwargs,
1062
1069
  ) -> Union[tuple, SwinImageClassifierOutput]:
1063
1070
  r"""
1064
1071
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1129,6 +1136,7 @@ class SwinBackbone(SwinPreTrainedModel, BackboneMixin):
1129
1136
  output_hidden_states: Optional[bool] = None,
1130
1137
  output_attentions: Optional[bool] = None,
1131
1138
  return_dict: Optional[bool] = None,
1139
+ **kwargs,
1132
1140
  ) -> BackboneOutput:
1133
1141
  """
1134
1142
  Returns:
@@ -97,7 +97,6 @@ class Swin2SRImageProcessorFast(BaseImageProcessorFast):
97
97
  stacked_images = self.pad(stacked_images, size_divisor=size_divisor)
98
98
  processed_image_grouped[shape] = stacked_images
99
99
  processed_images = reorder_images(processed_image_grouped, grouped_images_index)
100
- processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
101
100
 
102
101
  return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
103
102
 
@@ -250,40 +250,8 @@ class Swin2SRSelfAttention(nn.Module):
250
250
  nn.Linear(2, 512, bias=True), nn.ReLU(inplace=True), nn.Linear(512, num_heads, bias=False)
251
251
  )
252
252
 
253
- # get relative_coords_table
254
- relative_coords_h = torch.arange(-(self.window_size[0] - 1), self.window_size[0], dtype=torch.int64).float()
255
- relative_coords_w = torch.arange(-(self.window_size[1] - 1), self.window_size[1], dtype=torch.int64).float()
256
- relative_coords_table = (
257
- torch.stack(meshgrid([relative_coords_h, relative_coords_w], indexing="ij"))
258
- .permute(1, 2, 0)
259
- .contiguous()
260
- .unsqueeze(0)
261
- ) # [1, 2*window_height - 1, 2*window_width - 1, 2]
262
- if pretrained_window_size[0] > 0:
263
- relative_coords_table[:, :, :, 0] /= pretrained_window_size[0] - 1
264
- relative_coords_table[:, :, :, 1] /= pretrained_window_size[1] - 1
265
- elif window_size > 1:
266
- relative_coords_table[:, :, :, 0] /= self.window_size[0] - 1
267
- relative_coords_table[:, :, :, 1] /= self.window_size[1] - 1
268
- relative_coords_table *= 8 # normalize to -8, 8
269
- relative_coords_table = (
270
- torch.sign(relative_coords_table) * torch.log2(torch.abs(relative_coords_table) + 1.0) / math.log2(8)
271
- )
272
- # set to same dtype as mlp weight
273
- relative_coords_table = relative_coords_table.to(next(self.continuous_position_bias_mlp.parameters()).dtype)
253
+ relative_coords_table, relative_position_index = self.create_coords_table_and_index()
274
254
  self.register_buffer("relative_coords_table", relative_coords_table, persistent=False)
275
-
276
- # get pair-wise relative position index for each token inside the window
277
- coords_h = torch.arange(self.window_size[0])
278
- coords_w = torch.arange(self.window_size[1])
279
- coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
280
- coords_flatten = torch.flatten(coords, 1)
281
- relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
282
- relative_coords = relative_coords.permute(1, 2, 0).contiguous()
283
- relative_coords[:, :, 0] += self.window_size[0] - 1
284
- relative_coords[:, :, 1] += self.window_size[1] - 1
285
- relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
286
- relative_position_index = relative_coords.sum(-1)
287
255
  self.register_buffer("relative_position_index", relative_position_index, persistent=False)
288
256
 
289
257
  self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
@@ -359,6 +327,43 @@ class Swin2SRSelfAttention(nn.Module):
359
327
 
360
328
  return outputs
361
329
 
330
+ def create_coords_table_and_index(self):
331
+ # get relative_coords_table
332
+ relative_coords_h = torch.arange(-(self.window_size[0] - 1), self.window_size[0], dtype=torch.int64).float()
333
+ relative_coords_w = torch.arange(-(self.window_size[1] - 1), self.window_size[1], dtype=torch.int64).float()
334
+ relative_coords_table = (
335
+ torch.stack(meshgrid([relative_coords_h, relative_coords_w], indexing="ij"))
336
+ .permute(1, 2, 0)
337
+ .contiguous()
338
+ .unsqueeze(0)
339
+ ) # [1, 2*window_height - 1, 2*window_width - 1, 2]
340
+ if self.pretrained_window_size[0] > 0:
341
+ relative_coords_table[:, :, :, 0] /= self.pretrained_window_size[0] - 1
342
+ relative_coords_table[:, :, :, 1] /= self.pretrained_window_size[1] - 1
343
+ elif self.window_size[0] > 1:
344
+ relative_coords_table[:, :, :, 0] /= self.window_size[0] - 1
345
+ relative_coords_table[:, :, :, 1] /= self.window_size[1] - 1
346
+ relative_coords_table *= 8 # normalize to -8, 8
347
+ relative_coords_table = (
348
+ torch.sign(relative_coords_table) * torch.log2(torch.abs(relative_coords_table) + 1.0) / math.log2(8)
349
+ )
350
+ # set to same dtype as mlp weight
351
+ relative_coords_table = relative_coords_table.to(next(self.continuous_position_bias_mlp.parameters()).dtype)
352
+
353
+ # get pair-wise relative position index for each token inside the window
354
+ coords_h = torch.arange(self.window_size[0])
355
+ coords_w = torch.arange(self.window_size[1])
356
+ coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
357
+ coords_flatten = torch.flatten(coords, 1)
358
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
359
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous()
360
+ relative_coords[:, :, 0] += self.window_size[0] - 1
361
+ relative_coords[:, :, 1] += self.window_size[1] - 1
362
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
363
+ relative_position_index = relative_coords.sum(-1)
364
+
365
+ return relative_coords_table, relative_position_index
366
+
362
367
 
363
368
  # Copied from transformers.models.swin.modeling_swin.SwinSelfOutput with Swin->Swin2SR
364
369
  class Swin2SRSelfOutput(nn.Module):
@@ -702,6 +707,17 @@ class Swin2SRPreTrainedModel(PreTrainedModel):
702
707
  elif isinstance(module, nn.LayerNorm):
703
708
  init.zeros_(module.bias)
704
709
  init.ones_(module.weight)
710
+ elif isinstance(module, Swin2SRSelfAttention):
711
+ init.constant_(module.logit_scale, math.log(10))
712
+ relative_coords_table, relative_position_index = module.create_coords_table_and_index()
713
+ init.copy_(module.relative_coords_table, relative_coords_table)
714
+ init.copy_(module.relative_position_index, relative_position_index)
715
+ elif isinstance(module, Swin2SRModel):
716
+ if module.config.num_channels == 3 and module.config.num_channels_out == 3:
717
+ mean = torch.tensor([0.4488, 0.4371, 0.4040]).view(1, 3, 1, 1)
718
+ else:
719
+ mean = torch.zeros(1, 1, 1, 1)
720
+ init.copy_(module.mean, mean)
705
721
 
706
722
 
707
723
  @auto_docstring
@@ -754,6 +770,7 @@ class Swin2SRModel(Swin2SRPreTrainedModel):
754
770
  output_attentions: Optional[bool] = None,
755
771
  output_hidden_states: Optional[bool] = None,
756
772
  return_dict: Optional[bool] = None,
773
+ **kwargs,
757
774
  ) -> Union[tuple, BaseModelOutput]:
758
775
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
759
776
  output_hidden_states = (
@@ -972,6 +989,7 @@ class Swin2SRForImageSuperResolution(Swin2SRPreTrainedModel):
972
989
  output_attentions: Optional[bool] = None,
973
990
  output_hidden_states: Optional[bool] = None,
974
991
  return_dict: Optional[bool] = None,
992
+ **kwargs,
975
993
  ) -> Union[tuple, ImageSuperResolutionOutput]:
976
994
  r"""
977
995
  Example:
@@ -421,40 +421,8 @@ class Swinv2SelfAttention(nn.Module):
421
421
  nn.Linear(2, 512, bias=True), nn.ReLU(inplace=True), nn.Linear(512, num_heads, bias=False)
422
422
  )
423
423
 
424
- # get relative_coords_table
425
- relative_coords_h = torch.arange(-(self.window_size[0] - 1), self.window_size[0], dtype=torch.int64).float()
426
- relative_coords_w = torch.arange(-(self.window_size[1] - 1), self.window_size[1], dtype=torch.int64).float()
427
- relative_coords_table = (
428
- torch.stack(meshgrid([relative_coords_h, relative_coords_w], indexing="ij"))
429
- .permute(1, 2, 0)
430
- .contiguous()
431
- .unsqueeze(0)
432
- ) # [1, 2*window_height - 1, 2*window_width - 1, 2]
433
- if pretrained_window_size[0] > 0:
434
- relative_coords_table[:, :, :, 0] /= pretrained_window_size[0] - 1
435
- relative_coords_table[:, :, :, 1] /= pretrained_window_size[1] - 1
436
- elif window_size > 1:
437
- relative_coords_table[:, :, :, 0] /= self.window_size[0] - 1
438
- relative_coords_table[:, :, :, 1] /= self.window_size[1] - 1
439
- relative_coords_table *= 8 # normalize to -8, 8
440
- relative_coords_table = (
441
- torch.sign(relative_coords_table) * torch.log2(torch.abs(relative_coords_table) + 1.0) / math.log2(8)
442
- )
443
- # set to same dtype as mlp weight
444
- relative_coords_table = relative_coords_table.to(next(self.continuous_position_bias_mlp.parameters()).dtype)
424
+ relative_coords_table, relative_position_index = self.create_coords_table_and_index()
445
425
  self.register_buffer("relative_coords_table", relative_coords_table, persistent=False)
446
-
447
- # get pair-wise relative position index for each token inside the window
448
- coords_h = torch.arange(self.window_size[0])
449
- coords_w = torch.arange(self.window_size[1])
450
- coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
451
- coords_flatten = torch.flatten(coords, 1)
452
- relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
453
- relative_coords = relative_coords.permute(1, 2, 0).contiguous()
454
- relative_coords[:, :, 0] += self.window_size[0] - 1
455
- relative_coords[:, :, 1] += self.window_size[1] - 1
456
- relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
457
- relative_position_index = relative_coords.sum(-1)
458
426
  self.register_buffer("relative_position_index", relative_position_index, persistent=False)
459
427
 
460
428
  self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
@@ -530,6 +498,43 @@ class Swinv2SelfAttention(nn.Module):
530
498
 
531
499
  return outputs
532
500
 
501
+ def create_coords_table_and_index(self):
502
+ # get relative_coords_table
503
+ relative_coords_h = torch.arange(-(self.window_size[0] - 1), self.window_size[0], dtype=torch.int64).float()
504
+ relative_coords_w = torch.arange(-(self.window_size[1] - 1), self.window_size[1], dtype=torch.int64).float()
505
+ relative_coords_table = (
506
+ torch.stack(meshgrid([relative_coords_h, relative_coords_w], indexing="ij"))
507
+ .permute(1, 2, 0)
508
+ .contiguous()
509
+ .unsqueeze(0)
510
+ ) # [1, 2*window_height - 1, 2*window_width - 1, 2]
511
+ if self.pretrained_window_size[0] > 0:
512
+ relative_coords_table[:, :, :, 0] /= self.pretrained_window_size[0] - 1
513
+ relative_coords_table[:, :, :, 1] /= self.pretrained_window_size[1] - 1
514
+ elif self.window_size[0] > 1:
515
+ relative_coords_table[:, :, :, 0] /= self.window_size[0] - 1
516
+ relative_coords_table[:, :, :, 1] /= self.window_size[1] - 1
517
+ relative_coords_table *= 8 # normalize to -8, 8
518
+ relative_coords_table = (
519
+ torch.sign(relative_coords_table) * torch.log2(torch.abs(relative_coords_table) + 1.0) / math.log2(8)
520
+ )
521
+ # set to same dtype as mlp weight
522
+ relative_coords_table = relative_coords_table.to(next(self.continuous_position_bias_mlp.parameters()).dtype)
523
+
524
+ # get pair-wise relative position index for each token inside the window
525
+ coords_h = torch.arange(self.window_size[0])
526
+ coords_w = torch.arange(self.window_size[1])
527
+ coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
528
+ coords_flatten = torch.flatten(coords, 1)
529
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
530
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous()
531
+ relative_coords[:, :, 0] += self.window_size[0] - 1
532
+ relative_coords[:, :, 1] += self.window_size[1] - 1
533
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
534
+ relative_position_index = relative_coords.sum(-1)
535
+
536
+ return relative_coords_table, relative_position_index
537
+
533
538
 
534
539
  # Copied from transformers.models.swin.modeling_swin.SwinSelfOutput with Swin->Swinv2
535
540
  class Swinv2SelfOutput(nn.Module):
@@ -904,6 +909,9 @@ class Swinv2PreTrainedModel(PreTrainedModel):
904
909
  init.zeros_(module.position_embeddings)
905
910
  elif isinstance(module, Swinv2SelfAttention):
906
911
  init.constant_(module.logit_scale, math.log(10))
912
+ relative_coords_table, relative_position_index = module.create_coords_table_and_index()
913
+ init.copy_(module.relative_coords_table, relative_coords_table)
914
+ init.copy_(module.relative_position_index, relative_position_index)
907
915
 
908
916
 
909
917
  @auto_docstring
@@ -942,6 +950,7 @@ class Swinv2Model(Swinv2PreTrainedModel):
942
950
  output_hidden_states: Optional[bool] = None,
943
951
  interpolate_pos_encoding: bool = False,
944
952
  return_dict: Optional[bool] = None,
953
+ **kwargs,
945
954
  ) -> Union[tuple, Swinv2ModelOutput]:
946
955
  r"""
947
956
  bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
@@ -1030,6 +1039,7 @@ class Swinv2ForMaskedImageModeling(Swinv2PreTrainedModel):
1030
1039
  output_hidden_states: Optional[bool] = None,
1031
1040
  interpolate_pos_encoding: bool = False,
1032
1041
  return_dict: Optional[bool] = None,
1042
+ **kwargs,
1033
1043
  ) -> Union[tuple, Swinv2MaskedImageModelingOutput]:
1034
1044
  r"""
1035
1045
  bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
@@ -1144,6 +1154,7 @@ class Swinv2ForImageClassification(Swinv2PreTrainedModel):
1144
1154
  output_hidden_states: Optional[bool] = None,
1145
1155
  interpolate_pos_encoding: bool = False,
1146
1156
  return_dict: Optional[bool] = None,
1157
+ **kwargs,
1147
1158
  ) -> Union[tuple, Swinv2ImageClassifierOutput]:
1148
1159
  r"""
1149
1160
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1209,6 +1220,7 @@ class Swinv2Backbone(Swinv2PreTrainedModel, BackboneMixin):
1209
1220
  output_attentions: Optional[bool] = None,
1210
1221
  output_hidden_states: Optional[bool] = None,
1211
1222
  return_dict: Optional[bool] = None,
1223
+ **kwargs,
1212
1224
  ) -> BackboneOutput:
1213
1225
  r"""
1214
1226
  Examples: