transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc1-py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (539)
  1. transformers/__init__.py +30 -3
  2. transformers/cli/serve.py +47 -17
  3. transformers/conversion_mapping.py +15 -2
  4. transformers/convert_slow_tokenizer.py +225 -10
  5. transformers/core_model_loading.py +196 -135
  6. transformers/data/data_collator.py +12 -4
  7. transformers/dependency_versions_table.py +1 -2
  8. transformers/dynamic_module_utils.py +1 -2
  9. transformers/feature_extraction_utils.py +1 -2
  10. transformers/file_utils.py +0 -1
  11. transformers/generation/__init__.py +11 -1
  12. transformers/generation/configuration_utils.py +3 -2
  13. transformers/generation/continuous_batching/__init__.py +4 -0
  14. transformers/generation/continuous_batching/continuous_api.py +134 -79
  15. transformers/image_processing_base.py +1 -2
  16. transformers/integrations/__init__.py +4 -2
  17. transformers/integrations/accelerate.py +15 -3
  18. transformers/integrations/aqlm.py +38 -66
  19. transformers/integrations/awq.py +48 -514
  20. transformers/integrations/bitnet.py +45 -100
  21. transformers/integrations/bitsandbytes.py +79 -191
  22. transformers/integrations/deepspeed.py +1 -0
  23. transformers/integrations/eetq.py +84 -79
  24. transformers/integrations/fbgemm_fp8.py +191 -145
  25. transformers/integrations/finegrained_fp8.py +236 -193
  26. transformers/integrations/fp_quant.py +92 -0
  27. transformers/integrations/ggml.py +11 -1
  28. transformers/integrations/higgs.py +40 -62
  29. transformers/integrations/hub_kernels.py +42 -3
  30. transformers/integrations/integration_utils.py +10 -0
  31. transformers/integrations/mxfp4.py +25 -65
  32. transformers/integrations/peft.py +7 -29
  33. transformers/integrations/quanto.py +73 -55
  34. transformers/integrations/quark.py +55 -0
  35. transformers/integrations/spqr.py +44 -90
  36. transformers/integrations/torchao.py +32 -38
  37. transformers/integrations/vptq.py +42 -59
  38. transformers/modelcard.py +1 -2
  39. transformers/modeling_gguf_pytorch_utils.py +8 -0
  40. transformers/modeling_rope_utils.py +30 -6
  41. transformers/modeling_utils.py +116 -112
  42. transformers/models/__init__.py +3 -0
  43. transformers/models/afmoe/modeling_afmoe.py +4 -4
  44. transformers/models/albert/tokenization_albert.py +6 -12
  45. transformers/models/align/modeling_align.py +2 -0
  46. transformers/models/altclip/modeling_altclip.py +4 -0
  47. transformers/models/apertus/modeling_apertus.py +4 -4
  48. transformers/models/arcee/modeling_arcee.py +4 -4
  49. transformers/models/aria/modeling_aria.py +4 -4
  50. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  51. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  52. transformers/models/auto/configuration_auto.py +11 -0
  53. transformers/models/auto/feature_extraction_auto.py +2 -0
  54. transformers/models/auto/image_processing_auto.py +1 -0
  55. transformers/models/auto/modeling_auto.py +6 -0
  56. transformers/models/auto/processing_auto.py +18 -10
  57. transformers/models/auto/tokenization_auto.py +74 -472
  58. transformers/models/autoformer/modeling_autoformer.py +4 -0
  59. transformers/models/bamba/modeling_bamba.py +4 -3
  60. transformers/models/bark/modeling_bark.py +2 -0
  61. transformers/models/bart/modeling_bart.py +7 -0
  62. transformers/models/barthez/tokenization_barthez.py +5 -10
  63. transformers/models/beit/modeling_beit.py +6 -1
  64. transformers/models/bert/tokenization_bert.py +8 -21
  65. transformers/models/big_bird/modeling_big_bird.py +6 -0
  66. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  67. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
  68. transformers/models/biogpt/modeling_biogpt.py +2 -0
  69. transformers/models/biogpt/modular_biogpt.py +2 -0
  70. transformers/models/bit/modeling_bit.py +11 -2
  71. transformers/models/bitnet/modeling_bitnet.py +4 -4
  72. transformers/models/blenderbot/modeling_blenderbot.py +5 -0
  73. transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
  74. transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
  75. transformers/models/blip/modeling_blip_text.py +2 -0
  76. transformers/models/blip_2/modeling_blip_2.py +2 -1
  77. transformers/models/bloom/modeling_bloom.py +4 -0
  78. transformers/models/blt/modeling_blt.py +2 -2
  79. transformers/models/blt/modular_blt.py +2 -2
  80. transformers/models/bridgetower/modeling_bridgetower.py +5 -1
  81. transformers/models/bros/modeling_bros.py +4 -0
  82. transformers/models/camembert/tokenization_camembert.py +8 -12
  83. transformers/models/canine/modeling_canine.py +5 -0
  84. transformers/models/chameleon/modeling_chameleon.py +2 -1
  85. transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
  86. transformers/models/clap/modeling_clap.py +5 -0
  87. transformers/models/clip/tokenization_clip.py +22 -44
  88. transformers/models/clipseg/modeling_clipseg.py +5 -0
  89. transformers/models/clvp/modeling_clvp.py +5 -0
  90. transformers/models/clvp/tokenization_clvp.py +1 -63
  91. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  92. transformers/models/codegen/tokenization_codegen.py +14 -43
  93. transformers/models/cohere/modeling_cohere.py +4 -3
  94. transformers/models/cohere/modular_cohere.py +2 -1
  95. transformers/models/cohere/tokenization_cohere.py +12 -42
  96. transformers/models/cohere2/modeling_cohere2.py +7 -6
  97. transformers/models/cohere2/modular_cohere2.py +5 -5
  98. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
  99. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  100. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  101. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  102. transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
  103. transformers/models/convbert/modeling_convbert.py +6 -0
  104. transformers/models/convnext/modeling_convnext.py +2 -4
  105. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  106. transformers/models/csm/modeling_csm.py +4 -3
  107. transformers/models/ctrl/modeling_ctrl.py +1 -0
  108. transformers/models/cvt/modeling_cvt.py +2 -0
  109. transformers/models/cwm/modeling_cwm.py +4 -4
  110. transformers/models/d_fine/modeling_d_fine.py +2 -0
  111. transformers/models/d_fine/modular_d_fine.py +1 -0
  112. transformers/models/dab_detr/modeling_dab_detr.py +4 -0
  113. transformers/models/dac/modeling_dac.py +2 -2
  114. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  115. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  116. transformers/models/dbrx/modeling_dbrx.py +2 -2
  117. transformers/models/deberta/modeling_deberta.py +5 -0
  118. transformers/models/deberta/tokenization_deberta.py +11 -20
  119. transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
  120. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  121. transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
  122. transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
  123. transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
  124. transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
  125. transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
  126. transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
  127. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  128. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  129. transformers/models/detr/modeling_detr.py +5 -0
  130. transformers/models/dia/modeling_dia.py +4 -3
  131. transformers/models/dia/modular_dia.py +0 -1
  132. transformers/models/diffllama/modeling_diffllama.py +2 -2
  133. transformers/models/dinat/modeling_dinat.py +3 -0
  134. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  135. transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
  136. transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
  137. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  138. transformers/models/doge/modeling_doge.py +2 -3
  139. transformers/models/doge/modular_doge.py +0 -1
  140. transformers/models/donut/modeling_donut_swin.py +2 -0
  141. transformers/models/dots1/modeling_dots1.py +10 -7
  142. transformers/models/dots1/modular_dots1.py +5 -3
  143. transformers/models/dpr/modeling_dpr.py +5 -0
  144. transformers/models/dpr/tokenization_dpr.py +12 -0
  145. transformers/models/edgetam/modeling_edgetam.py +1 -1
  146. transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
  147. transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
  148. transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
  149. transformers/models/efficientnet/modeling_efficientnet.py +2 -0
  150. transformers/models/emu3/modeling_emu3.py +4 -4
  151. transformers/models/eomt/image_processing_eomt.py +13 -1
  152. transformers/models/eomt/image_processing_eomt_fast.py +14 -2
  153. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  154. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  155. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
  156. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
  157. transformers/models/esm/modeling_esmfold.py +5 -4
  158. transformers/models/evolla/modeling_evolla.py +4 -4
  159. transformers/models/exaone4/modeling_exaone4.py +2 -2
  160. transformers/models/exaone4/modular_exaone4.py +0 -1
  161. transformers/models/falcon/modeling_falcon.py +6 -1
  162. transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
  163. transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
  164. transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
  165. transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
  166. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  167. transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
  168. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  169. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
  170. transformers/models/flaubert/modeling_flaubert.py +7 -0
  171. transformers/models/flava/modeling_flava.py +6 -1
  172. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
  173. transformers/models/florence2/modeling_florence2.py +2 -1
  174. transformers/models/florence2/modular_florence2.py +2 -1
  175. transformers/models/fnet/modeling_fnet.py +7 -0
  176. transformers/models/focalnet/modeling_focalnet.py +4 -0
  177. transformers/models/fsmt/modeling_fsmt.py +2 -0
  178. transformers/models/funnel/modeling_funnel.py +8 -0
  179. transformers/models/funnel/tokenization_funnel.py +17 -24
  180. transformers/models/fuyu/processing_fuyu.py +3 -3
  181. transformers/models/gemma/modeling_gemma.py +4 -4
  182. transformers/models/gemma/tokenization_gemma.py +10 -27
  183. transformers/models/gemma2/modeling_gemma2.py +4 -4
  184. transformers/models/gemma2/modular_gemma2.py +2 -1
  185. transformers/models/gemma3/modeling_gemma3.py +14 -84
  186. transformers/models/gemma3/modular_gemma3.py +12 -81
  187. transformers/models/gemma3n/modeling_gemma3n.py +18 -209
  188. transformers/models/gemma3n/modular_gemma3n.py +17 -59
  189. transformers/models/git/modeling_git.py +2 -0
  190. transformers/models/glm/modeling_glm.py +4 -4
  191. transformers/models/glm4/modeling_glm4.py +4 -4
  192. transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
  193. transformers/models/glm4v/configuration_glm4v.py +3 -1
  194. transformers/models/glm4v/modeling_glm4v.py +3 -3
  195. transformers/models/glm4v/modular_glm4v.py +6 -4
  196. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  197. transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
  198. transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
  199. transformers/models/glpn/modeling_glpn.py +2 -0
  200. transformers/models/gpt2/modeling_gpt2.py +5 -1
  201. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  202. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
  203. transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
  204. transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
  205. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  206. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  207. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
  208. transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
  209. transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
  210. transformers/models/gptj/modeling_gptj.py +3 -0
  211. transformers/models/granite/modeling_granite.py +4 -4
  212. transformers/models/granitemoe/modeling_granitemoe.py +4 -6
  213. transformers/models/granitemoe/modular_granitemoe.py +0 -2
  214. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
  215. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
  216. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
  217. transformers/models/groupvit/modeling_groupvit.py +3 -0
  218. transformers/models/helium/modeling_helium.py +4 -3
  219. transformers/models/herbert/tokenization_herbert.py +9 -25
  220. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
  221. transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
  222. transformers/models/hiera/modeling_hiera.py +4 -0
  223. transformers/models/hubert/modeling_hubert.py +3 -0
  224. transformers/models/hubert/modular_hubert.py +1 -0
  225. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
  226. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
  227. transformers/models/ibert/modeling_ibert.py +6 -0
  228. transformers/models/idefics/modeling_idefics.py +5 -21
  229. transformers/models/imagegpt/modeling_imagegpt.py +2 -1
  230. transformers/models/informer/modeling_informer.py +4 -0
  231. transformers/models/informer/modular_informer.py +1 -0
  232. transformers/models/internvl/modeling_internvl.py +2 -4
  233. transformers/models/internvl/modular_internvl.py +2 -4
  234. transformers/models/jamba/modeling_jamba.py +2 -2
  235. transformers/models/janus/modeling_janus.py +1 -0
  236. transformers/models/janus/modular_janus.py +1 -0
  237. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  238. transformers/models/kosmos2/modeling_kosmos2.py +1 -0
  239. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
  240. transformers/models/lasr/__init__.py +29 -0
  241. transformers/models/lasr/configuration_lasr.py +244 -0
  242. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  243. transformers/models/lasr/modeling_lasr.py +729 -0
  244. transformers/models/lasr/modular_lasr.py +569 -0
  245. transformers/models/lasr/processing_lasr.py +96 -0
  246. transformers/models/lasr/tokenization_lasr.py +186 -0
  247. transformers/models/layoutlm/modeling_layoutlm.py +5 -0
  248. transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
  249. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
  250. transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
  251. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  252. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  253. transformers/models/led/modeling_led.py +6 -0
  254. transformers/models/levit/modeling_levit.py +3 -0
  255. transformers/models/lfm2/modeling_lfm2.py +4 -5
  256. transformers/models/lfm2/modular_lfm2.py +0 -1
  257. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
  258. transformers/models/lightglue/modeling_lightglue.py +3 -1
  259. transformers/models/lightglue/modular_lightglue.py +1 -0
  260. transformers/models/lilt/modeling_lilt.py +4 -0
  261. transformers/models/llama/modeling_llama.py +4 -4
  262. transformers/models/llama/tokenization_llama.py +15 -43
  263. transformers/models/llama4/modeling_llama4.py +3 -2
  264. transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
  265. transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
  266. transformers/models/longformer/modeling_longformer.py +6 -0
  267. transformers/models/longt5/modeling_longt5.py +4 -0
  268. transformers/models/luke/modeling_luke.py +9 -0
  269. transformers/models/luke/tokenization_luke.py +11 -38
  270. transformers/models/lxmert/modeling_lxmert.py +2 -0
  271. transformers/models/m2m_100/modeling_m2m_100.py +4 -0
  272. transformers/models/mamba/modeling_mamba.py +14 -22
  273. transformers/models/marian/modeling_marian.py +5 -0
  274. transformers/models/markuplm/modeling_markuplm.py +4 -0
  275. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  276. transformers/models/mask2former/modeling_mask2former.py +2 -0
  277. transformers/models/maskformer/modeling_maskformer.py +2 -0
  278. transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
  279. transformers/models/mbart/modeling_mbart.py +7 -0
  280. transformers/models/mbart/tokenization_mbart.py +11 -52
  281. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  282. transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
  283. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  284. transformers/models/mimi/modeling_mimi.py +3 -1
  285. transformers/models/minimax/modeling_minimax.py +4 -4
  286. transformers/models/ministral/modeling_ministral.py +4 -4
  287. transformers/models/ministral3/configuration_ministral3.py +1 -1
  288. transformers/models/ministral3/modeling_ministral3.py +4 -3
  289. transformers/models/mistral/modeling_mistral.py +4 -3
  290. transformers/models/mixtral/modeling_mixtral.py +4 -4
  291. transformers/models/mllama/modeling_mllama.py +2 -2
  292. transformers/models/mluke/tokenization_mluke.py +6 -6
  293. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
  294. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  295. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  296. transformers/models/mobilevit/modeling_mobilevit.py +3 -0
  297. transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
  298. transformers/models/modernbert/modeling_modernbert.py +4 -1
  299. transformers/models/modernbert/modular_modernbert.py +2 -0
  300. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
  301. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
  302. transformers/models/moonshine/modeling_moonshine.py +4 -2
  303. transformers/models/moshi/modeling_moshi.py +5 -2
  304. transformers/models/mpnet/modeling_mpnet.py +5 -0
  305. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  306. transformers/models/mpt/modeling_mpt.py +2 -0
  307. transformers/models/mra/modeling_mra.py +6 -0
  308. transformers/models/mt5/modeling_mt5.py +7 -0
  309. transformers/models/musicgen/modeling_musicgen.py +2 -0
  310. transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
  311. transformers/models/mvp/modeling_mvp.py +7 -0
  312. transformers/models/nanochat/modeling_nanochat.py +4 -4
  313. transformers/models/nemotron/modeling_nemotron.py +4 -2
  314. transformers/models/nllb/tokenization_nllb.py +8 -22
  315. transformers/models/nougat/tokenization_nougat.py +11 -59
  316. transformers/models/nystromformer/modeling_nystromformer.py +6 -0
  317. transformers/models/olmo/modeling_olmo.py +4 -4
  318. transformers/models/olmo/modular_olmo.py +2 -2
  319. transformers/models/olmo2/modeling_olmo2.py +4 -5
  320. transformers/models/olmo2/modular_olmo2.py +0 -1
  321. transformers/models/olmo3/modeling_olmo3.py +4 -4
  322. transformers/models/olmoe/modeling_olmoe.py +4 -4
  323. transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
  324. transformers/models/oneformer/modeling_oneformer.py +4 -1
  325. transformers/models/openai/modeling_openai.py +3 -0
  326. transformers/models/openai/tokenization_openai.py +10 -46
  327. transformers/models/opt/modeling_opt.py +2 -0
  328. transformers/models/owlv2/modeling_owlv2.py +4 -0
  329. transformers/models/owlvit/modeling_owlvit.py +4 -0
  330. transformers/models/paddleocr_vl/__init__.py +32 -0
  331. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  332. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
  333. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  334. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
  335. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
  336. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  337. transformers/models/parakeet/configuration_parakeet.py +4 -6
  338. transformers/models/parakeet/modeling_parakeet.py +9 -6
  339. transformers/models/parakeet/modular_parakeet.py +2 -2
  340. transformers/models/parakeet/processing_parakeet.py +1 -0
  341. transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
  342. transformers/models/patchtst/modeling_patchtst.py +20 -2
  343. transformers/models/pegasus/modeling_pegasus.py +5 -0
  344. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  345. transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
  346. transformers/models/perceiver/modeling_perceiver.py +8 -0
  347. transformers/models/persimmon/modeling_persimmon.py +2 -1
  348. transformers/models/phi/modeling_phi.py +4 -5
  349. transformers/models/phi/modular_phi.py +0 -1
  350. transformers/models/phi3/modeling_phi3.py +2 -1
  351. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
  352. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
  353. transformers/models/phimoe/modeling_phimoe.py +4 -4
  354. transformers/models/phimoe/modular_phimoe.py +2 -2
  355. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  356. transformers/models/pixtral/modeling_pixtral.py +2 -1
  357. transformers/models/plbart/modeling_plbart.py +6 -0
  358. transformers/models/plbart/modular_plbart.py +2 -0
  359. transformers/models/plbart/tokenization_plbart.py +0 -2
  360. transformers/models/poolformer/modeling_poolformer.py +2 -0
  361. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  362. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  363. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  364. transformers/models/prophetnet/modeling_prophetnet.py +3 -0
  365. transformers/models/pvt/modeling_pvt.py +2 -0
  366. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  367. transformers/models/qwen2/modeling_qwen2.py +4 -4
  368. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  369. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  370. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
  371. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
  372. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  373. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
  374. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
  375. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
  376. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  377. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  378. transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
  379. transformers/models/qwen3/modeling_qwen3.py +4 -4
  380. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  381. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
  382. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
  383. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
  384. transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
  385. transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
  386. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
  387. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
  388. transformers/models/rag/modeling_rag.py +1 -0
  389. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
  390. transformers/models/reformer/modeling_reformer.py +4 -0
  391. transformers/models/reformer/tokenization_reformer.py +11 -28
  392. transformers/models/regnet/modeling_regnet.py +6 -1
  393. transformers/models/rembert/modeling_rembert.py +6 -0
  394. transformers/models/rembert/tokenization_rembert.py +3 -10
  395. transformers/models/resnet/modeling_resnet.py +11 -2
  396. transformers/models/roberta/tokenization_roberta.py +18 -27
  397. transformers/models/roformer/modeling_roformer.py +6 -0
  398. transformers/models/roformer/tokenization_roformer.py +77 -412
  399. transformers/models/rt_detr/modeling_rt_detr.py +2 -0
  400. transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
  401. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
  402. transformers/models/rwkv/modeling_rwkv.py +1 -0
  403. transformers/models/sam2/modeling_sam2.py +2 -2
  404. transformers/models/sam2/modular_sam2.py +2 -2
  405. transformers/models/sam2_video/modeling_sam2_video.py +1 -0
  406. transformers/models/sam2_video/modular_sam2_video.py +1 -0
  407. transformers/models/sam3/modeling_sam3.py +77 -80
  408. transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
  409. transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
  410. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
  411. transformers/models/sam3_video/modeling_sam3_video.py +1 -0
  412. transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
  413. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  414. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
  415. transformers/models/seed_oss/modeling_seed_oss.py +2 -2
  416. transformers/models/segformer/modeling_segformer.py +4 -1
  417. transformers/models/seggpt/modeling_seggpt.py +2 -0
  418. transformers/models/sew/modeling_sew.py +3 -0
  419. transformers/models/sew/modular_sew.py +1 -0
  420. transformers/models/sew_d/modeling_sew_d.py +3 -0
  421. transformers/models/siglip2/modeling_siglip2.py +4 -0
  422. transformers/models/siglip2/modular_siglip2.py +4 -0
  423. transformers/models/smollm3/modeling_smollm3.py +4 -4
  424. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  425. transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
  426. transformers/models/speecht5/modeling_speecht5.py +13 -1
  427. transformers/models/splinter/modeling_splinter.py +3 -0
  428. transformers/models/splinter/tokenization_splinter.py +9 -28
  429. transformers/models/squeezebert/modeling_squeezebert.py +6 -0
  430. transformers/models/stablelm/modeling_stablelm.py +3 -1
  431. transformers/models/starcoder2/modeling_starcoder2.py +4 -3
  432. transformers/models/superglue/modeling_superglue.py +1 -0
  433. transformers/models/superpoint/modeling_superpoint.py +1 -0
  434. transformers/models/swiftformer/modeling_swiftformer.py +2 -0
  435. transformers/models/swin/modeling_swin.py +4 -0
  436. transformers/models/swin2sr/modeling_swin2sr.py +2 -0
  437. transformers/models/swinv2/modeling_swinv2.py +4 -0
  438. transformers/models/t5/modeling_t5.py +7 -0
  439. transformers/models/t5/tokenization_t5.py +4 -8
  440. transformers/models/t5gemma/modeling_t5gemma.py +5 -5
  441. transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
  442. transformers/models/table_transformer/modeling_table_transformer.py +4 -0
  443. transformers/models/tapas/modeling_tapas.py +3 -0
  444. transformers/models/textnet/modeling_textnet.py +11 -2
  445. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  446. transformers/models/timesfm/modeling_timesfm.py +2 -0
  447. transformers/models/timesfm/modular_timesfm.py +2 -0
  448. transformers/models/timesformer/modeling_timesformer.py +2 -0
  449. transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
  450. transformers/models/trocr/modeling_trocr.py +2 -0
  451. transformers/models/tvp/modeling_tvp.py +2 -0
  452. transformers/models/udop/modeling_udop.py +4 -0
  453. transformers/models/udop/tokenization_udop.py +5 -13
  454. transformers/models/umt5/modeling_umt5.py +7 -0
  455. transformers/models/unispeech/modeling_unispeech.py +4 -0
  456. transformers/models/unispeech/modular_unispeech.py +2 -0
  457. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  458. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  459. transformers/models/univnet/modeling_univnet.py +1 -0
  460. transformers/models/upernet/modeling_upernet.py +1 -0
  461. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  462. transformers/models/vilt/modeling_vilt.py +6 -0
  463. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  464. transformers/models/visual_bert/modeling_visual_bert.py +6 -0
  465. transformers/models/vitdet/modeling_vitdet.py +2 -0
  466. transformers/models/vitmatte/modeling_vitmatte.py +1 -0
  467. transformers/models/vits/modeling_vits.py +1 -0
  468. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  469. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  470. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
  471. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
  472. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
  473. transformers/models/wavlm/modeling_wavlm.py +5 -0
  474. transformers/models/whisper/modeling_whisper.py +6 -0
  475. transformers/models/whisper/tokenization_whisper.py +4 -15
  476. transformers/models/x_clip/modeling_x_clip.py +3 -0
  477. transformers/models/xglm/modeling_xglm.py +1 -0
  478. transformers/models/xglm/tokenization_xglm.py +4 -9
  479. transformers/models/xlm/modeling_xlm.py +5 -0
  480. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  481. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  482. transformers/models/yoso/modeling_yoso.py +6 -0
  483. transformers/models/zamba/modeling_zamba.py +2 -0
  484. transformers/models/zamba2/modeling_zamba2.py +4 -2
  485. transformers/models/zamba2/modular_zamba2.py +1 -1
  486. transformers/models/zoedepth/modeling_zoedepth.py +1 -0
  487. transformers/pipelines/__init__.py +2 -3
  488. transformers/pipelines/base.py +1 -9
  489. transformers/pipelines/document_question_answering.py +3 -1
  490. transformers/pipelines/text_generation.py +1 -1
  491. transformers/processing_utils.py +23 -11
  492. transformers/quantizers/base.py +35 -110
  493. transformers/quantizers/quantizer_aqlm.py +1 -5
  494. transformers/quantizers/quantizer_auto_round.py +1 -2
  495. transformers/quantizers/quantizer_awq.py +17 -81
  496. transformers/quantizers/quantizer_bitnet.py +3 -8
  497. transformers/quantizers/quantizer_bnb_4bit.py +13 -110
  498. transformers/quantizers/quantizer_bnb_8bit.py +16 -92
  499. transformers/quantizers/quantizer_compressed_tensors.py +1 -5
  500. transformers/quantizers/quantizer_eetq.py +14 -62
  501. transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
  502. transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
  503. transformers/quantizers/quantizer_fp_quant.py +48 -78
  504. transformers/quantizers/quantizer_gptq.py +7 -24
  505. transformers/quantizers/quantizer_higgs.py +40 -54
  506. transformers/quantizers/quantizer_hqq.py +144 -153
  507. transformers/quantizers/quantizer_mxfp4.py +13 -167
  508. transformers/quantizers/quantizer_quanto.py +20 -64
  509. transformers/quantizers/quantizer_quark.py +36 -17
  510. transformers/quantizers/quantizer_spqr.py +1 -4
  511. transformers/quantizers/quantizer_torchao.py +23 -202
  512. transformers/quantizers/quantizer_vptq.py +8 -22
  513. transformers/quantizers/quantizers_utils.py +20 -0
  514. transformers/testing_utils.py +297 -36
  515. transformers/tokenization_mistral_common.py +4 -0
  516. transformers/tokenization_utils_base.py +113 -222
  517. transformers/tokenization_utils_tokenizers.py +168 -107
  518. transformers/trainer.py +28 -31
  519. transformers/trainer_jit_checkpoint.py +126 -0
  520. transformers/trainer_utils.py +1 -1
  521. transformers/training_args.py +66 -28
  522. transformers/utils/__init__.py +3 -4
  523. transformers/utils/auto_docstring.py +1 -0
  524. transformers/utils/generic.py +27 -1
  525. transformers/utils/hub.py +5 -15
  526. transformers/utils/import_utils.py +61 -16
  527. transformers/utils/kernel_config.py +4 -2
  528. transformers/utils/loading_report.py +19 -10
  529. transformers/utils/quantization_config.py +75 -242
  530. transformers/video_processing_utils.py +1 -2
  531. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
  532. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
  533. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
  534. transformers/kernels/__init__.py +0 -0
  535. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  536. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  537. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
  538. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
  539. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
transformers/models/phi/modeling_phi.py

@@ -13,7 +13,7 @@ import torch.nn as nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_func_from_hub
+from ...integrations import use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import (
     GenericForSequenceClassification,
@@ -25,7 +25,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_phi import PhiConfig


@@ -90,7 +90,7 @@ class PhiRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
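Many hunks in this release swap torch.autocast for a new maybe_autocast helper imported from transformers.utils.generic. The helper's body lives in transformers/utils/generic.py (+27 -1 in the index above) and is not shown in these hunks; what follows is only a minimal sketch of its likely shape, assuming it falls back to a no-op context when the device type has no autocast backend. The fallback logic here is an assumption, not the shipped code:

    # Hypothetical sketch of maybe_autocast -- not the shipped implementation.
    from contextlib import nullcontext

    import torch

    def maybe_autocast(device_type=None, enabled=True, **kwargs):
        # torch.autocast raises for device types without an autocast backend
        # (e.g. "meta"), which breaks callers that only want to disable autocast.
        if device_type is not None and torch.amp.is_autocast_available(device_type):
            return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
        return nullcontext()  # no-op context: autocast is not applicable here

Under that reading, the force-float32 rotary computation behaves exactly as before on CUDA/CPU while avoiding hard failures on backends where autocast is undefined.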
@@ -172,6 +172,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class PhiAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -187,7 +188,6 @@ class PhiAttention(nn.Module):
         self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True)
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
         self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
-        self.rotary_fn = apply_rotary_pos_emb
         self.dense = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=True)
         self.rotary_ndims = int(self.head_dim * config.rope_parameters["partial_rotary_factor"])
         self.qk_layernorm = config.qk_layernorm
@@ -206,7 +206,6 @@ class PhiAttention(nn.Module):
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
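The @use_kernelized_func(apply_rotary_pos_emb) decorator replaces the per-instance self.rotary_fn = apply_rotary_pos_emb assignment from rc0, and position_ids drops out of the explicit forward signature (it can still arrive via **kwargs). The decorator is defined in transformers/integrations/hub_kernels.py (+42 -3 in the index) and its body is not part of this diff; a plausible sketch, assuming it registers the reference function at class level so a hub-provided kernel can later replace it. The attribute name rotary_fn comes from the removed line; everything else is illustrative:

    # Hypothetical sketch of use_kernelized_func -- not the shipped implementation.
    def use_kernelized_func(func):
        def decorator(cls):
            # Expose the plain-PyTorch function as a class attribute so a
            # kernels-hub integration can swap in a fused kernel later.
            cls.rotary_fn = staticmethod(func)
            return cls
        return decorator

Moving the assignment from __init__ to a class decorator means the substitution happens once per class rather than once per attention-module instance.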
transformers/models/phi/modular_phi.py

@@ -92,7 +92,6 @@ class PhiAttention(LlamaAttention):
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
transformers/models/phi3/modeling_phi3.py

@@ -44,6 +44,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.generic import maybe_autocast
 from .configuration_phi3 import Phi3Config


@@ -123,7 +124,7 @@ class Phi3RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
transformers/models/phi4_multimodal/modeling_phi4_multimodal.py

@@ -47,7 +47,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import auto_docstring, can_return_tuple, torch_int
-from ...utils.generic import TransformersKwargs, check_model_inputs
+from ...utils.generic import TransformersKwargs, check_model_inputs, maybe_autocast
 from .configuration_phi4_multimodal import Phi4MultimodalAudioConfig, Phi4MultimodalConfig, Phi4MultimodalVisionConfig


@@ -602,7 +602,7 @@ class Phi4MultimodalImageEmbedding(nn.Module):

         # Temporarily disable autocast to avoid issue on bf16 tensors
         # Ref: https://github.com/pytorch/pytorch/issues/132715
-        with torch.autocast(device_type=inputs_embeds.device.type, enabled=False):
+        with maybe_autocast(device_type=inputs_embeds.device.type, enabled=False):
             image_embeds = inputs_embeds.index_put(
                 indices=positions_tuple, values=merged_img_set_tensor, accumulate=False
             )
@@ -1014,7 +1014,7 @@ class Phi4MultimodalAudioModel(Phi4MultimodalAudioPreTrainedModel):
         pad_mask = pad_mask & enc_streaming_mask
         return pad_mask

-    def forward(self, hidden_states: torch.Tensor, mask: Optional[torch.Tensor]):
+    def forward(self, hidden_states: torch.Tensor, mask: Optional[torch.Tensor], **kwargs):
         hidden_states = self.encoder_embedding(hidden_states)
         hidden_states, hs_mask, mask = self.forward_embeddings(hidden_states, mask)

@@ -1116,7 +1116,7 @@ class Phi4MultimodalAudioEmbedding(nn.Module):
         merged_audio_embeds = merged_audio_embeds.to(dtype=inputs_embeds.dtype, device=inputs_embeds.device)
         # Temporarily disable autocast to avoid issue on bf16 tensors
         # Ref: https://github.com/pytorch/pytorch/issues/132715
-        with torch.autocast(device_type=inputs_embeds.device.type, enabled=False):
+        with maybe_autocast(device_type=inputs_embeds.device.type, enabled=False):
             audio_embeds = inputs_embeds.index_put(
                 indices=positions_tuple, values=merged_audio_embeds, accumulate=False
             )
@@ -1500,7 +1500,7 @@ class Phi4MultimodalRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
transformers/models/phi4_multimodal/modular_phi4_multimodal.py

@@ -37,7 +37,7 @@ from ...modeling_rope_utils import RopeParameters
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import auto_docstring, logging
-from ...utils.generic import TransformersKwargs, check_model_inputs
+from ...utils.generic import TransformersKwargs, check_model_inputs, maybe_autocast
 from ..phi3.configuration_phi3 import Phi3Config
 from ..phi3.modeling_phi3 import (
     Phi3DecoderLayer,
@@ -844,7 +844,7 @@ class Phi4MultimodalImageEmbedding(nn.Module):

         # Temporarily disable autocast to avoid issue on bf16 tensors
         # Ref: https://github.com/pytorch/pytorch/issues/132715
-        with torch.autocast(device_type=inputs_embeds.device.type, enabled=False):
+        with maybe_autocast(device_type=inputs_embeds.device.type, enabled=False):
             image_embeds = inputs_embeds.index_put(
                 indices=positions_tuple, values=merged_img_set_tensor, accumulate=False
             )
@@ -1205,7 +1205,7 @@ class Phi4MultimodalAudioModel(Phi4MultimodalAudioPreTrainedModel):
         pad_mask = pad_mask & enc_streaming_mask
         return pad_mask

-    def forward(self, hidden_states: torch.Tensor, mask: Optional[torch.Tensor]):
+    def forward(self, hidden_states: torch.Tensor, mask: Optional[torch.Tensor], **kwargs):
         hidden_states = self.encoder_embedding(hidden_states)
         hidden_states, hs_mask, mask = self.forward_embeddings(hidden_states, mask)

@@ -1358,7 +1358,7 @@ class Phi4MultimodalAudioEmbedding(nn.Module):
         merged_audio_embeds = merged_audio_embeds.to(dtype=inputs_embeds.dtype, device=inputs_embeds.device)
         # Temporarily disable autocast to avoid issue on bf16 tensors
         # Ref: https://github.com/pytorch/pytorch/issues/132715
-        with torch.autocast(device_type=inputs_embeds.device.type, enabled=False):
+        with maybe_autocast(device_type=inputs_embeds.device.type, enabled=False):
             audio_embeds = inputs_embeds.index_put(
                 indices=positions_tuple, values=merged_audio_embeds, accumulate=False
             )
transformers/models/phimoe/modeling_phimoe.py

@@ -30,7 +30,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
 from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
@@ -38,7 +38,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import OutputRecorder, check_model_inputs
+from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from .configuration_phimoe import PhimoeConfig


@@ -113,7 +113,7 @@ class PhimoeRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * mscale
@@ -194,6 +194,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class PhimoeAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -219,7 +220,6 @@ class PhimoeAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb

     def forward(
         self,
transformers/models/phimoe/modular_phimoe.py

@@ -24,7 +24,7 @@ from ...modeling_layers import (
     GenericForSequenceClassification,
 )
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
-from ...utils.generic import OutputRecorder
+from ...utils.generic import OutputRecorder, maybe_autocast
 from ..llama.modeling_llama import LlamaAttention
 from ..mixtral.modeling_mixtral import (
     MixtralDecoderLayer,
@@ -74,7 +74,7 @@ class PhimoeRotaryEmbedding(MixtralRotaryEmbedding):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * mscale
transformers/models/pix2struct/modeling_pix2struct.py

@@ -481,6 +481,7 @@ class Pix2StructVisionModel(Pix2StructPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
@@ -1359,6 +1360,7 @@ class Pix2StructForConditionalGeneration(Pix2StructPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
         r"""
         flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
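From here on, most hunks make the same one-line change: forward signatures gain a trailing **kwargs so these models tolerate extra keyword arguments threaded through by shared infrastructure (attention kwargs, generation and pipeline plumbing) instead of raising TypeError. A toy illustration of the effect, not code from the diff:

    # rc0-style signature: an unexpected keyword argument raises TypeError.
    def forward(pixel_values=None, return_dict=None):
        ...

    # rc1-style signature: extra keywords are accepted (ignored or forwarded).
    def forward(pixel_values=None, return_dict=None, **kwargs):
        ...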
transformers/models/pixtral/modeling_pixtral.py

@@ -28,6 +28,7 @@ from ...modeling_rope_utils import dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import auto_docstring, can_return_tuple, logging
+from ...utils.generic import maybe_autocast
 from .configuration_pixtral import PixtralVisionConfig


@@ -125,7 +126,7 @@ class PixtralRotaryEmbedding(nn.Module):
     def forward(self, x, position_ids):
         freqs = self.inv_freq[position_ids]
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             emb = freqs
             cos = emb.cos()
             sin = emb.sin()
transformers/models/plbart/modeling_plbart.py

@@ -366,6 +366,7 @@ class PLBartEncoder(PLBartPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         Args:
@@ -621,6 +622,7 @@ class PLBartDecoder(PLBartPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
         r"""
         Args:
@@ -867,6 +869,7 @@ class PLBartModel(PLBartPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1002,6 +1005,7 @@ class PLBartForConditionalGeneration(PLBartPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1159,6 +1163,7 @@ class PLBartForSequenceClassification(PLBartPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqSequenceClassifierOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1316,6 +1321,7 @@ class PLBartForCausalLM(PLBartPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
transformers/models/plbart/modular_plbart.py

@@ -108,6 +108,7 @@ class PLBartModel(PLBartPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -243,6 +244,7 @@ class PLBartForConditionalGeneration(PLBartPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
transformers/models/plbart/tokenization_plbart.py

@@ -125,7 +125,6 @@ class PLBartTokenizer(SentencePieceBackend):
         pad_token="<pad>",
         mask_token="<mask>",
         language_codes="base",
-        tokenizer_file=None,
         src_lang=None,
         tgt_lang=None,
         sp_model_kwargs: Optional[dict[str, Any]] = None,
@@ -171,7 +170,6 @@ class PLBartTokenizer(SentencePieceBackend):
             cls_token=cls_token,
             pad_token=pad_token,
             mask_token=mask_token,
-            tokenizer_file=tokenizer_file,
             src_lang=src_lang,
             tgt_lang=tgt_lang,
             additional_special_tokens=_additional_special_tokens,
transformers/models/poolformer/modeling_poolformer.py

@@ -276,6 +276,7 @@ class PoolFormerModel(PoolFormerPreTrainedModel):
         pixel_values: Optional[torch.FloatTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithNoAttention]:
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -339,6 +340,7 @@ class PoolFormerForImageClassification(PoolFormerPreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
transformers/models/pop2piano/modeling_pop2piano.py

@@ -644,6 +644,7 @@ class Pop2PianoStack(Pop2PianoPreTrainedModel):
         output_hidden_states=None,
         return_dict=None,
         cache_position=None,
+        **kwargs,
     ):
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1051,6 +1052,7 @@ class Pop2PianoForConditionalGeneration(Pop2PianoPreTrainedModel, GenerationMixin):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py

@@ -393,6 +393,7 @@ class PromptDepthAnythingForDepthEstimation(PromptDepthAnythingPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], DepthEstimatorOutput]:
         r"""
         prompt_depth (`torch.FloatTensor` of shape `(batch_size, 1, height, width)`, *optional*):

transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py

@@ -236,6 +236,7 @@ class PromptDepthAnythingForDepthEstimation(DepthAnythingForDepthEstimation):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], DepthEstimatorOutput]:
         r"""
         prompt_depth (`torch.FloatTensor` of shape `(batch_size, 1, height, width)`, *optional*):
transformers/models/prophetnet/modeling_prophetnet.py

@@ -993,6 +993,7 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         Example:
@@ -1113,6 +1114,7 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, ProphetNetDecoderModelOutput]:
         r"""
         Example:
@@ -1416,6 +1418,7 @@ class ProphetNetModel(ProphetNetPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, ProphetNetSeq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
transformers/models/pvt/modeling_pvt.py

@@ -458,6 +458,7 @@ class PvtModel(PvtPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -512,6 +513,7 @@ class PvtForImageClassification(PvtPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ImageClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -406,6 +406,7 @@ class PvtV2Model(PvtV2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -460,6 +461,7 @@ class PvtV2ForImageClassification(PvtV2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ImageClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -523,6 +525,7 @@ class PvtV2Backbone(PvtV2Model, BackboneMixin):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> BackboneOutput:
         r"""
         Examples:
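
All of the signature hunks above apply the same mechanical change: `**kwargs` is appended to `forward()` so that extra keyword arguments (for example the generic `TransformersKwargs` plumbing imported in the Qwen2 hunks below) are absorbed instead of raising a `TypeError`. A minimal, self-contained sketch of the pattern, with illustrative names:

    import torch
    from torch import nn

    class TinyClassifier(nn.Module):
        def forward(self, pixel_values, return_dict=None, **kwargs):
            # Unrecognized keys are silently absorbed here rather than raising
            # "forward() got an unexpected keyword argument ...".
            return pixel_values.mean(dim=(1, 2, 3))

    model = TinyClassifier()
    out = model(torch.ones(2, 3, 4, 4), return_dict=True, some_future_flag=False)
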
@@ -13,7 +13,7 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import (
@@ -27,7 +27,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_qwen2 import Qwen2Config
 
 
@@ -103,7 +103,7 @@ class Qwen2RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
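
Here `maybe_autocast` replaces the direct `torch.autocast` context manager. The actual helper is imported from `...utils.generic` (see the import hunk above); a plausible sketch of the idea, under the assumption that it simply degrades to a no-op where autocast is unsupported:

    import contextlib
    import torch

    def maybe_autocast_sketch(device_type: str, enabled: bool = True):
        # torch.autocast raises for device types with no autocast support,
        # so fall back to a no-op context in that case (assumed behavior).
        try:
            return torch.autocast(device_type=device_type, enabled=enabled)
        except RuntimeError:
            return contextlib.nullcontext()

    with maybe_autocast_sketch("cpu", enabled=False):  # force the float32 path
        freqs = torch.ones(2).float() @ torch.ones(2).float()
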
@@ -185,6 +185,7 @@ def eager_attention_forward(
     return attn_output, attn_weights
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class Qwen2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -202,7 +203,6 @@ class Qwen2Attention(nn.Module):
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
         self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
         self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
-        self.rotary_fn = apply_rotary_pos_emb
         self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None
 
     def forward(
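
Taken together, these two hunks move rotary-embedding dispatch from a per-instance attribute (`self.rotary_fn = apply_rotary_pos_emb` in `__init__`) to the class decorator `@use_kernelized_func(apply_rotary_pos_emb)`. A hypothetical sketch of the mechanics (the real decorator lives in `...integrations` and may additionally swap in a hub-provided kernel):

    def use_kernelized_func_sketch(func):
        def wrap(cls):
            # Attach the default eager implementation at class level; a
            # kernelized variant could be substituted here when available.
            cls.rotary_fn = staticmethod(func)
            return cls
        return wrap

    @use_kernelized_func_sketch(lambda q, k, cos, sin: (q, k))  # stand-in rotary fn
    class AttentionSketch:
        pass

    assert AttentionSketch.rotary_fn("q", "k", None, None) == ("q", "k")
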
@@ -14,10 +14,11 @@
 # limitations under the License.
 """Tokenization classes for Qwen2."""
 
+from typing import Optional, Union
+
 from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers
 from tokenizers.models import BPE
 
-from ...tokenization_utils_base import generate_merges
 from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
 
@@ -38,33 +39,30 @@ PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p
 class Qwen2Tokenizer(TokenizersBackend):
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = None
+    model = BPE
 
     def __init__(
         self,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
+        merges: Optional[Union[str, list[str]]] = None,
         vocab_file=None,
         merges_file=None,
-        unk_token="<|endoftext|>",
+        unk_token: str = "<|endoftext|>",
         bos_token=None,
-        eos_token="<|endoftext|>",
-        pad_token="<|endoftext|>",
+        eos_token: str = "<|endoftext|>",
+        pad_token: str = "<|endoftext|>",
         add_prefix_space=None,
-        vocab=None,
-        merges=None,
         **kwargs,
     ):
         self.add_prefix_space = add_prefix_space if add_prefix_space is not None else False
-
-        if vocab is not None:
-            self._vocab = (
-                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
-            )
-        else:
-            self._vocab = {
+        self._vocab = (
+            vocab
+            if vocab is not None
+            else {
                 "<|endoftext|>": 0,
             }
-        self._merges = merges if merges is not None else generate_merges(self._vocab)
-
+        )
+        self._merges = merges or []
         self._tokenizer = Tokenizer(
             BPE(
                 vocab=self._vocab,
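
Under the new constructor shape, `vocab` and `merges` become the leading parameters and accept in-memory objects directly; `merges` now defaults to an empty list rather than merges generated from the vocab (`generate_merges` is no longer imported). An untested usage sketch of this release's API:

    from transformers import Qwen2Tokenizer

    # Build a toy tokenizer entirely in memory; token ids are illustrative.
    tok = Qwen2Tokenizer(vocab={"<|endoftext|>": 0, "hi": 1}, merges=[])
    print(tok.eos_token)  # "<|endoftext|>"
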
@@ -92,12 +90,10 @@ class Qwen2Tokenizer(TokenizersBackend):
                 ),
             ]
         )
-        tokenizer_object = self._tokenizer
 
         super().__init__(
             vocab_file=vocab_file,
             merges_file=merges_file,
-            tokenizer_object=tokenizer_object,
             unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
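
Note that `tokenizer_object` is no longer forwarded to `super().__init__()`; presumably `TokenizersBackend` now picks up the prebuilt backend from `self._tokenizer` itself, though that is an inference from this hunk alone.
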
@@ -365,7 +365,7 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig):
         self.rope_parameters = rope_parameters
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
-            ignore_keys_at_rope_validation={"mrope"},
+            ignore_keys_at_rope_validation={"mrope_section"},
             **kwargs,
         )
 
@@ -713,7 +713,9 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig):
         layer_type_validation(self.layer_types, self.num_hidden_layers)
 
         self.rope_parameters = rope_parameters
-        super().__init__(tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope"}, **kwargs)
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs
+        )
 
 
 class Qwen2_5OmniDiTConfig(PreTrainedConfig):
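
Both config hunks above rename the rope-validation ignore key from `"mrope"` to `"mrope_section"`, matching the key these configs actually store in `rope_parameters`. A hedged illustration (values made up):

    # "mrope_section" is the multimodal-rope key that validation must skip;
    # the entries split the rotary dimensions across temporal/height/width.
    rope_parameters = {
        "rope_type": "default",
        "mrope_section": [16, 24, 24],  # illustrative values only
    }
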