transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (539)
  1. transformers/__init__.py +30 -3
  2. transformers/cli/serve.py +47 -17
  3. transformers/conversion_mapping.py +15 -2
  4. transformers/convert_slow_tokenizer.py +225 -10
  5. transformers/core_model_loading.py +196 -135
  6. transformers/data/data_collator.py +12 -4
  7. transformers/dependency_versions_table.py +1 -2
  8. transformers/dynamic_module_utils.py +1 -2
  9. transformers/feature_extraction_utils.py +1 -2
  10. transformers/file_utils.py +0 -1
  11. transformers/generation/__init__.py +11 -1
  12. transformers/generation/configuration_utils.py +3 -2
  13. transformers/generation/continuous_batching/__init__.py +4 -0
  14. transformers/generation/continuous_batching/continuous_api.py +134 -79
  15. transformers/image_processing_base.py +1 -2
  16. transformers/integrations/__init__.py +4 -2
  17. transformers/integrations/accelerate.py +15 -3
  18. transformers/integrations/aqlm.py +38 -66
  19. transformers/integrations/awq.py +48 -514
  20. transformers/integrations/bitnet.py +45 -100
  21. transformers/integrations/bitsandbytes.py +79 -191
  22. transformers/integrations/deepspeed.py +1 -0
  23. transformers/integrations/eetq.py +84 -79
  24. transformers/integrations/fbgemm_fp8.py +191 -145
  25. transformers/integrations/finegrained_fp8.py +236 -193
  26. transformers/integrations/fp_quant.py +92 -0
  27. transformers/integrations/ggml.py +11 -1
  28. transformers/integrations/higgs.py +40 -62
  29. transformers/integrations/hub_kernels.py +42 -3
  30. transformers/integrations/integration_utils.py +10 -0
  31. transformers/integrations/mxfp4.py +25 -65
  32. transformers/integrations/peft.py +7 -29
  33. transformers/integrations/quanto.py +73 -55
  34. transformers/integrations/quark.py +55 -0
  35. transformers/integrations/spqr.py +44 -90
  36. transformers/integrations/torchao.py +32 -38
  37. transformers/integrations/vptq.py +42 -59
  38. transformers/modelcard.py +1 -2
  39. transformers/modeling_gguf_pytorch_utils.py +8 -0
  40. transformers/modeling_rope_utils.py +30 -6
  41. transformers/modeling_utils.py +116 -112
  42. transformers/models/__init__.py +3 -0
  43. transformers/models/afmoe/modeling_afmoe.py +4 -4
  44. transformers/models/albert/tokenization_albert.py +6 -12
  45. transformers/models/align/modeling_align.py +2 -0
  46. transformers/models/altclip/modeling_altclip.py +4 -0
  47. transformers/models/apertus/modeling_apertus.py +4 -4
  48. transformers/models/arcee/modeling_arcee.py +4 -4
  49. transformers/models/aria/modeling_aria.py +4 -4
  50. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  51. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  52. transformers/models/auto/configuration_auto.py +11 -0
  53. transformers/models/auto/feature_extraction_auto.py +2 -0
  54. transformers/models/auto/image_processing_auto.py +1 -0
  55. transformers/models/auto/modeling_auto.py +6 -0
  56. transformers/models/auto/processing_auto.py +18 -10
  57. transformers/models/auto/tokenization_auto.py +74 -472
  58. transformers/models/autoformer/modeling_autoformer.py +4 -0
  59. transformers/models/bamba/modeling_bamba.py +4 -3
  60. transformers/models/bark/modeling_bark.py +2 -0
  61. transformers/models/bart/modeling_bart.py +7 -0
  62. transformers/models/barthez/tokenization_barthez.py +5 -10
  63. transformers/models/beit/modeling_beit.py +6 -1
  64. transformers/models/bert/tokenization_bert.py +8 -21
  65. transformers/models/big_bird/modeling_big_bird.py +6 -0
  66. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  67. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
  68. transformers/models/biogpt/modeling_biogpt.py +2 -0
  69. transformers/models/biogpt/modular_biogpt.py +2 -0
  70. transformers/models/bit/modeling_bit.py +11 -2
  71. transformers/models/bitnet/modeling_bitnet.py +4 -4
  72. transformers/models/blenderbot/modeling_blenderbot.py +5 -0
  73. transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
  74. transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
  75. transformers/models/blip/modeling_blip_text.py +2 -0
  76. transformers/models/blip_2/modeling_blip_2.py +2 -1
  77. transformers/models/bloom/modeling_bloom.py +4 -0
  78. transformers/models/blt/modeling_blt.py +2 -2
  79. transformers/models/blt/modular_blt.py +2 -2
  80. transformers/models/bridgetower/modeling_bridgetower.py +5 -1
  81. transformers/models/bros/modeling_bros.py +4 -0
  82. transformers/models/camembert/tokenization_camembert.py +8 -12
  83. transformers/models/canine/modeling_canine.py +5 -0
  84. transformers/models/chameleon/modeling_chameleon.py +2 -1
  85. transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
  86. transformers/models/clap/modeling_clap.py +5 -0
  87. transformers/models/clip/tokenization_clip.py +22 -44
  88. transformers/models/clipseg/modeling_clipseg.py +5 -0
  89. transformers/models/clvp/modeling_clvp.py +5 -0
  90. transformers/models/clvp/tokenization_clvp.py +1 -63
  91. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  92. transformers/models/codegen/tokenization_codegen.py +14 -43
  93. transformers/models/cohere/modeling_cohere.py +4 -3
  94. transformers/models/cohere/modular_cohere.py +2 -1
  95. transformers/models/cohere/tokenization_cohere.py +12 -42
  96. transformers/models/cohere2/modeling_cohere2.py +7 -6
  97. transformers/models/cohere2/modular_cohere2.py +5 -5
  98. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
  99. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  100. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  101. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  102. transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
  103. transformers/models/convbert/modeling_convbert.py +6 -0
  104. transformers/models/convnext/modeling_convnext.py +2 -4
  105. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  106. transformers/models/csm/modeling_csm.py +4 -3
  107. transformers/models/ctrl/modeling_ctrl.py +1 -0
  108. transformers/models/cvt/modeling_cvt.py +2 -0
  109. transformers/models/cwm/modeling_cwm.py +4 -4
  110. transformers/models/d_fine/modeling_d_fine.py +2 -0
  111. transformers/models/d_fine/modular_d_fine.py +1 -0
  112. transformers/models/dab_detr/modeling_dab_detr.py +4 -0
  113. transformers/models/dac/modeling_dac.py +2 -2
  114. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  115. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  116. transformers/models/dbrx/modeling_dbrx.py +2 -2
  117. transformers/models/deberta/modeling_deberta.py +5 -0
  118. transformers/models/deberta/tokenization_deberta.py +11 -20
  119. transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
  120. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  121. transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
  122. transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
  123. transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
  124. transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
  125. transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
  126. transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
  127. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  128. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  129. transformers/models/detr/modeling_detr.py +5 -0
  130. transformers/models/dia/modeling_dia.py +4 -3
  131. transformers/models/dia/modular_dia.py +0 -1
  132. transformers/models/diffllama/modeling_diffllama.py +2 -2
  133. transformers/models/dinat/modeling_dinat.py +3 -0
  134. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  135. transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
  136. transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
  137. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  138. transformers/models/doge/modeling_doge.py +2 -3
  139. transformers/models/doge/modular_doge.py +0 -1
  140. transformers/models/donut/modeling_donut_swin.py +2 -0
  141. transformers/models/dots1/modeling_dots1.py +10 -7
  142. transformers/models/dots1/modular_dots1.py +5 -3
  143. transformers/models/dpr/modeling_dpr.py +5 -0
  144. transformers/models/dpr/tokenization_dpr.py +12 -0
  145. transformers/models/edgetam/modeling_edgetam.py +1 -1
  146. transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
  147. transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
  148. transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
  149. transformers/models/efficientnet/modeling_efficientnet.py +2 -0
  150. transformers/models/emu3/modeling_emu3.py +4 -4
  151. transformers/models/eomt/image_processing_eomt.py +13 -1
  152. transformers/models/eomt/image_processing_eomt_fast.py +14 -2
  153. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  154. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  155. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
  156. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
  157. transformers/models/esm/modeling_esmfold.py +5 -4
  158. transformers/models/evolla/modeling_evolla.py +4 -4
  159. transformers/models/exaone4/modeling_exaone4.py +2 -2
  160. transformers/models/exaone4/modular_exaone4.py +0 -1
  161. transformers/models/falcon/modeling_falcon.py +6 -1
  162. transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
  163. transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
  164. transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
  165. transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
  166. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  167. transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
  168. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  169. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
  170. transformers/models/flaubert/modeling_flaubert.py +7 -0
  171. transformers/models/flava/modeling_flava.py +6 -1
  172. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
  173. transformers/models/florence2/modeling_florence2.py +2 -1
  174. transformers/models/florence2/modular_florence2.py +2 -1
  175. transformers/models/fnet/modeling_fnet.py +7 -0
  176. transformers/models/focalnet/modeling_focalnet.py +4 -0
  177. transformers/models/fsmt/modeling_fsmt.py +2 -0
  178. transformers/models/funnel/modeling_funnel.py +8 -0
  179. transformers/models/funnel/tokenization_funnel.py +17 -24
  180. transformers/models/fuyu/processing_fuyu.py +3 -3
  181. transformers/models/gemma/modeling_gemma.py +4 -4
  182. transformers/models/gemma/tokenization_gemma.py +10 -27
  183. transformers/models/gemma2/modeling_gemma2.py +4 -4
  184. transformers/models/gemma2/modular_gemma2.py +2 -1
  185. transformers/models/gemma3/modeling_gemma3.py +14 -84
  186. transformers/models/gemma3/modular_gemma3.py +12 -81
  187. transformers/models/gemma3n/modeling_gemma3n.py +18 -209
  188. transformers/models/gemma3n/modular_gemma3n.py +17 -59
  189. transformers/models/git/modeling_git.py +2 -0
  190. transformers/models/glm/modeling_glm.py +4 -4
  191. transformers/models/glm4/modeling_glm4.py +4 -4
  192. transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
  193. transformers/models/glm4v/configuration_glm4v.py +3 -1
  194. transformers/models/glm4v/modeling_glm4v.py +3 -3
  195. transformers/models/glm4v/modular_glm4v.py +6 -4
  196. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  197. transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
  198. transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
  199. transformers/models/glpn/modeling_glpn.py +2 -0
  200. transformers/models/gpt2/modeling_gpt2.py +5 -1
  201. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  202. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
  203. transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
  204. transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
  205. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  206. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  207. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
  208. transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
  209. transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
  210. transformers/models/gptj/modeling_gptj.py +3 -0
  211. transformers/models/granite/modeling_granite.py +4 -4
  212. transformers/models/granitemoe/modeling_granitemoe.py +4 -6
  213. transformers/models/granitemoe/modular_granitemoe.py +0 -2
  214. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
  215. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
  216. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
  217. transformers/models/groupvit/modeling_groupvit.py +3 -0
  218. transformers/models/helium/modeling_helium.py +4 -3
  219. transformers/models/herbert/tokenization_herbert.py +9 -25
  220. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
  221. transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
  222. transformers/models/hiera/modeling_hiera.py +4 -0
  223. transformers/models/hubert/modeling_hubert.py +3 -0
  224. transformers/models/hubert/modular_hubert.py +1 -0
  225. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
  226. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
  227. transformers/models/ibert/modeling_ibert.py +6 -0
  228. transformers/models/idefics/modeling_idefics.py +5 -21
  229. transformers/models/imagegpt/modeling_imagegpt.py +2 -1
  230. transformers/models/informer/modeling_informer.py +4 -0
  231. transformers/models/informer/modular_informer.py +1 -0
  232. transformers/models/internvl/modeling_internvl.py +2 -4
  233. transformers/models/internvl/modular_internvl.py +2 -4
  234. transformers/models/jamba/modeling_jamba.py +2 -2
  235. transformers/models/janus/modeling_janus.py +1 -0
  236. transformers/models/janus/modular_janus.py +1 -0
  237. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  238. transformers/models/kosmos2/modeling_kosmos2.py +1 -0
  239. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
  240. transformers/models/lasr/__init__.py +29 -0
  241. transformers/models/lasr/configuration_lasr.py +244 -0
  242. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  243. transformers/models/lasr/modeling_lasr.py +729 -0
  244. transformers/models/lasr/modular_lasr.py +569 -0
  245. transformers/models/lasr/processing_lasr.py +96 -0
  246. transformers/models/lasr/tokenization_lasr.py +186 -0
  247. transformers/models/layoutlm/modeling_layoutlm.py +5 -0
  248. transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
  249. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
  250. transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
  251. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  252. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  253. transformers/models/led/modeling_led.py +6 -0
  254. transformers/models/levit/modeling_levit.py +3 -0
  255. transformers/models/lfm2/modeling_lfm2.py +4 -5
  256. transformers/models/lfm2/modular_lfm2.py +0 -1
  257. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
  258. transformers/models/lightglue/modeling_lightglue.py +3 -1
  259. transformers/models/lightglue/modular_lightglue.py +1 -0
  260. transformers/models/lilt/modeling_lilt.py +4 -0
  261. transformers/models/llama/modeling_llama.py +4 -4
  262. transformers/models/llama/tokenization_llama.py +15 -43
  263. transformers/models/llama4/modeling_llama4.py +3 -2
  264. transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
  265. transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
  266. transformers/models/longformer/modeling_longformer.py +6 -0
  267. transformers/models/longt5/modeling_longt5.py +4 -0
  268. transformers/models/luke/modeling_luke.py +9 -0
  269. transformers/models/luke/tokenization_luke.py +11 -38
  270. transformers/models/lxmert/modeling_lxmert.py +2 -0
  271. transformers/models/m2m_100/modeling_m2m_100.py +4 -0
  272. transformers/models/mamba/modeling_mamba.py +14 -22
  273. transformers/models/marian/modeling_marian.py +5 -0
  274. transformers/models/markuplm/modeling_markuplm.py +4 -0
  275. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  276. transformers/models/mask2former/modeling_mask2former.py +2 -0
  277. transformers/models/maskformer/modeling_maskformer.py +2 -0
  278. transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
  279. transformers/models/mbart/modeling_mbart.py +7 -0
  280. transformers/models/mbart/tokenization_mbart.py +11 -52
  281. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  282. transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
  283. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  284. transformers/models/mimi/modeling_mimi.py +3 -1
  285. transformers/models/minimax/modeling_minimax.py +4 -4
  286. transformers/models/ministral/modeling_ministral.py +4 -4
  287. transformers/models/ministral3/configuration_ministral3.py +1 -1
  288. transformers/models/ministral3/modeling_ministral3.py +4 -3
  289. transformers/models/mistral/modeling_mistral.py +4 -3
  290. transformers/models/mixtral/modeling_mixtral.py +4 -4
  291. transformers/models/mllama/modeling_mllama.py +2 -2
  292. transformers/models/mluke/tokenization_mluke.py +6 -6
  293. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
  294. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  295. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  296. transformers/models/mobilevit/modeling_mobilevit.py +3 -0
  297. transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
  298. transformers/models/modernbert/modeling_modernbert.py +4 -1
  299. transformers/models/modernbert/modular_modernbert.py +2 -0
  300. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
  301. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
  302. transformers/models/moonshine/modeling_moonshine.py +4 -2
  303. transformers/models/moshi/modeling_moshi.py +5 -2
  304. transformers/models/mpnet/modeling_mpnet.py +5 -0
  305. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  306. transformers/models/mpt/modeling_mpt.py +2 -0
  307. transformers/models/mra/modeling_mra.py +6 -0
  308. transformers/models/mt5/modeling_mt5.py +7 -0
  309. transformers/models/musicgen/modeling_musicgen.py +2 -0
  310. transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
  311. transformers/models/mvp/modeling_mvp.py +7 -0
  312. transformers/models/nanochat/modeling_nanochat.py +4 -4
  313. transformers/models/nemotron/modeling_nemotron.py +4 -2
  314. transformers/models/nllb/tokenization_nllb.py +8 -22
  315. transformers/models/nougat/tokenization_nougat.py +11 -59
  316. transformers/models/nystromformer/modeling_nystromformer.py +6 -0
  317. transformers/models/olmo/modeling_olmo.py +4 -4
  318. transformers/models/olmo/modular_olmo.py +2 -2
  319. transformers/models/olmo2/modeling_olmo2.py +4 -5
  320. transformers/models/olmo2/modular_olmo2.py +0 -1
  321. transformers/models/olmo3/modeling_olmo3.py +4 -4
  322. transformers/models/olmoe/modeling_olmoe.py +4 -4
  323. transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
  324. transformers/models/oneformer/modeling_oneformer.py +4 -1
  325. transformers/models/openai/modeling_openai.py +3 -0
  326. transformers/models/openai/tokenization_openai.py +10 -46
  327. transformers/models/opt/modeling_opt.py +2 -0
  328. transformers/models/owlv2/modeling_owlv2.py +4 -0
  329. transformers/models/owlvit/modeling_owlvit.py +4 -0
  330. transformers/models/paddleocr_vl/__init__.py +32 -0
  331. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  332. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
  333. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  334. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
  335. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
  336. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  337. transformers/models/parakeet/configuration_parakeet.py +4 -6
  338. transformers/models/parakeet/modeling_parakeet.py +9 -6
  339. transformers/models/parakeet/modular_parakeet.py +2 -2
  340. transformers/models/parakeet/processing_parakeet.py +1 -0
  341. transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
  342. transformers/models/patchtst/modeling_patchtst.py +20 -2
  343. transformers/models/pegasus/modeling_pegasus.py +5 -0
  344. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  345. transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
  346. transformers/models/perceiver/modeling_perceiver.py +8 -0
  347. transformers/models/persimmon/modeling_persimmon.py +2 -1
  348. transformers/models/phi/modeling_phi.py +4 -5
  349. transformers/models/phi/modular_phi.py +0 -1
  350. transformers/models/phi3/modeling_phi3.py +2 -1
  351. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
  352. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
  353. transformers/models/phimoe/modeling_phimoe.py +4 -4
  354. transformers/models/phimoe/modular_phimoe.py +2 -2
  355. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  356. transformers/models/pixtral/modeling_pixtral.py +2 -1
  357. transformers/models/plbart/modeling_plbart.py +6 -0
  358. transformers/models/plbart/modular_plbart.py +2 -0
  359. transformers/models/plbart/tokenization_plbart.py +0 -2
  360. transformers/models/poolformer/modeling_poolformer.py +2 -0
  361. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  362. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  363. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  364. transformers/models/prophetnet/modeling_prophetnet.py +3 -0
  365. transformers/models/pvt/modeling_pvt.py +2 -0
  366. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  367. transformers/models/qwen2/modeling_qwen2.py +4 -4
  368. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  369. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  370. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
  371. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
  372. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  373. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
  374. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
  375. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
  376. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  377. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  378. transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
  379. transformers/models/qwen3/modeling_qwen3.py +4 -4
  380. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  381. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
  382. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
  383. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
  384. transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
  385. transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
  386. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
  387. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
  388. transformers/models/rag/modeling_rag.py +1 -0
  389. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
  390. transformers/models/reformer/modeling_reformer.py +4 -0
  391. transformers/models/reformer/tokenization_reformer.py +11 -28
  392. transformers/models/regnet/modeling_regnet.py +6 -1
  393. transformers/models/rembert/modeling_rembert.py +6 -0
  394. transformers/models/rembert/tokenization_rembert.py +3 -10
  395. transformers/models/resnet/modeling_resnet.py +11 -2
  396. transformers/models/roberta/tokenization_roberta.py +18 -27
  397. transformers/models/roformer/modeling_roformer.py +6 -0
  398. transformers/models/roformer/tokenization_roformer.py +77 -412
  399. transformers/models/rt_detr/modeling_rt_detr.py +2 -0
  400. transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
  401. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
  402. transformers/models/rwkv/modeling_rwkv.py +1 -0
  403. transformers/models/sam2/modeling_sam2.py +2 -2
  404. transformers/models/sam2/modular_sam2.py +2 -2
  405. transformers/models/sam2_video/modeling_sam2_video.py +1 -0
  406. transformers/models/sam2_video/modular_sam2_video.py +1 -0
  407. transformers/models/sam3/modeling_sam3.py +77 -80
  408. transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
  409. transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
  410. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
  411. transformers/models/sam3_video/modeling_sam3_video.py +1 -0
  412. transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
  413. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  414. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
  415. transformers/models/seed_oss/modeling_seed_oss.py +2 -2
  416. transformers/models/segformer/modeling_segformer.py +4 -1
  417. transformers/models/seggpt/modeling_seggpt.py +2 -0
  418. transformers/models/sew/modeling_sew.py +3 -0
  419. transformers/models/sew/modular_sew.py +1 -0
  420. transformers/models/sew_d/modeling_sew_d.py +3 -0
  421. transformers/models/siglip2/modeling_siglip2.py +4 -0
  422. transformers/models/siglip2/modular_siglip2.py +4 -0
  423. transformers/models/smollm3/modeling_smollm3.py +4 -4
  424. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  425. transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
  426. transformers/models/speecht5/modeling_speecht5.py +13 -1
  427. transformers/models/splinter/modeling_splinter.py +3 -0
  428. transformers/models/splinter/tokenization_splinter.py +9 -28
  429. transformers/models/squeezebert/modeling_squeezebert.py +6 -0
  430. transformers/models/stablelm/modeling_stablelm.py +3 -1
  431. transformers/models/starcoder2/modeling_starcoder2.py +4 -3
  432. transformers/models/superglue/modeling_superglue.py +1 -0
  433. transformers/models/superpoint/modeling_superpoint.py +1 -0
  434. transformers/models/swiftformer/modeling_swiftformer.py +2 -0
  435. transformers/models/swin/modeling_swin.py +4 -0
  436. transformers/models/swin2sr/modeling_swin2sr.py +2 -0
  437. transformers/models/swinv2/modeling_swinv2.py +4 -0
  438. transformers/models/t5/modeling_t5.py +7 -0
  439. transformers/models/t5/tokenization_t5.py +4 -8
  440. transformers/models/t5gemma/modeling_t5gemma.py +5 -5
  441. transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
  442. transformers/models/table_transformer/modeling_table_transformer.py +4 -0
  443. transformers/models/tapas/modeling_tapas.py +3 -0
  444. transformers/models/textnet/modeling_textnet.py +11 -2
  445. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  446. transformers/models/timesfm/modeling_timesfm.py +2 -0
  447. transformers/models/timesfm/modular_timesfm.py +2 -0
  448. transformers/models/timesformer/modeling_timesformer.py +2 -0
  449. transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
  450. transformers/models/trocr/modeling_trocr.py +2 -0
  451. transformers/models/tvp/modeling_tvp.py +2 -0
  452. transformers/models/udop/modeling_udop.py +4 -0
  453. transformers/models/udop/tokenization_udop.py +5 -13
  454. transformers/models/umt5/modeling_umt5.py +7 -0
  455. transformers/models/unispeech/modeling_unispeech.py +4 -0
  456. transformers/models/unispeech/modular_unispeech.py +2 -0
  457. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  458. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  459. transformers/models/univnet/modeling_univnet.py +1 -0
  460. transformers/models/upernet/modeling_upernet.py +1 -0
  461. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  462. transformers/models/vilt/modeling_vilt.py +6 -0
  463. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  464. transformers/models/visual_bert/modeling_visual_bert.py +6 -0
  465. transformers/models/vitdet/modeling_vitdet.py +2 -0
  466. transformers/models/vitmatte/modeling_vitmatte.py +1 -0
  467. transformers/models/vits/modeling_vits.py +1 -0
  468. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  469. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  470. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
  471. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
  472. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
  473. transformers/models/wavlm/modeling_wavlm.py +5 -0
  474. transformers/models/whisper/modeling_whisper.py +6 -0
  475. transformers/models/whisper/tokenization_whisper.py +4 -15
  476. transformers/models/x_clip/modeling_x_clip.py +3 -0
  477. transformers/models/xglm/modeling_xglm.py +1 -0
  478. transformers/models/xglm/tokenization_xglm.py +4 -9
  479. transformers/models/xlm/modeling_xlm.py +5 -0
  480. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  481. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  482. transformers/models/yoso/modeling_yoso.py +6 -0
  483. transformers/models/zamba/modeling_zamba.py +2 -0
  484. transformers/models/zamba2/modeling_zamba2.py +4 -2
  485. transformers/models/zamba2/modular_zamba2.py +1 -1
  486. transformers/models/zoedepth/modeling_zoedepth.py +1 -0
  487. transformers/pipelines/__init__.py +2 -3
  488. transformers/pipelines/base.py +1 -9
  489. transformers/pipelines/document_question_answering.py +3 -1
  490. transformers/pipelines/text_generation.py +1 -1
  491. transformers/processing_utils.py +23 -11
  492. transformers/quantizers/base.py +35 -110
  493. transformers/quantizers/quantizer_aqlm.py +1 -5
  494. transformers/quantizers/quantizer_auto_round.py +1 -2
  495. transformers/quantizers/quantizer_awq.py +17 -81
  496. transformers/quantizers/quantizer_bitnet.py +3 -8
  497. transformers/quantizers/quantizer_bnb_4bit.py +13 -110
  498. transformers/quantizers/quantizer_bnb_8bit.py +16 -92
  499. transformers/quantizers/quantizer_compressed_tensors.py +1 -5
  500. transformers/quantizers/quantizer_eetq.py +14 -62
  501. transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
  502. transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
  503. transformers/quantizers/quantizer_fp_quant.py +48 -78
  504. transformers/quantizers/quantizer_gptq.py +7 -24
  505. transformers/quantizers/quantizer_higgs.py +40 -54
  506. transformers/quantizers/quantizer_hqq.py +144 -153
  507. transformers/quantizers/quantizer_mxfp4.py +13 -167
  508. transformers/quantizers/quantizer_quanto.py +20 -64
  509. transformers/quantizers/quantizer_quark.py +36 -17
  510. transformers/quantizers/quantizer_spqr.py +1 -4
  511. transformers/quantizers/quantizer_torchao.py +23 -202
  512. transformers/quantizers/quantizer_vptq.py +8 -22
  513. transformers/quantizers/quantizers_utils.py +20 -0
  514. transformers/testing_utils.py +297 -36
  515. transformers/tokenization_mistral_common.py +4 -0
  516. transformers/tokenization_utils_base.py +113 -222
  517. transformers/tokenization_utils_tokenizers.py +168 -107
  518. transformers/trainer.py +28 -31
  519. transformers/trainer_jit_checkpoint.py +126 -0
  520. transformers/trainer_utils.py +1 -1
  521. transformers/training_args.py +66 -28
  522. transformers/utils/__init__.py +3 -4
  523. transformers/utils/auto_docstring.py +1 -0
  524. transformers/utils/generic.py +27 -1
  525. transformers/utils/hub.py +5 -15
  526. transformers/utils/import_utils.py +61 -16
  527. transformers/utils/kernel_config.py +4 -2
  528. transformers/utils/loading_report.py +19 -10
  529. transformers/utils/quantization_config.py +75 -242
  530. transformers/video_processing_utils.py +1 -2
  531. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
  532. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
  533. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
  534. transformers/kernels/__init__.py +0 -0
  535. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  536. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  537. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
  538. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
  539. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
@@ -13,7 +13,7 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
- from typing import Optional
16
+ from typing import Optional, Union
17
17
 
18
18
  from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
19
19
  from tokenizers.models import BPE
@@ -83,13 +83,15 @@ class NllbTokenizer(TokenizersBackend):
83
83
 
84
84
  vocab_files_names = VOCAB_FILES_NAMES
85
85
  model_input_names = ["input_ids", "attention_mask"]
86
- slow_tokenizer_class = None
86
+ model = BPE
87
87
 
88
88
  prefix_tokens: list[int] = []
89
89
  suffix_tokens: list[int] = []
90
90
 
91
91
  def __init__(
92
92
  self,
93
+ vocab: Optional[Union[str, dict[str, int]]] = None,
94
+ merges: Optional[Union[str, list[str]]] = None,
93
95
  bos_token="<s>",
94
96
  eos_token="</s>",
95
97
  sep_token="</s>",
@@ -101,16 +103,11 @@ class NllbTokenizer(TokenizersBackend):
101
103
  tgt_lang=None,
102
104
  additional_special_tokens=None,
103
105
  legacy_behaviour=False,
104
- vocab=None,
105
- merges=None,
106
- vocab_file=None,
107
106
  **kwargs,
108
107
  ):
109
108
  if additional_special_tokens is None:
110
109
  additional_special_tokens = kwargs.get("extra_special_tokens", FAIRSEQ_LANGUAGE_CODES)
111
110
 
112
- self.vocab_file = vocab_file
113
-
114
111
  mask_token = (
115
112
  AddedToken(mask_token, normalized=True, lstrip=True, special=True)
116
113
  if isinstance(mask_token, str)
@@ -118,23 +115,15 @@ class NllbTokenizer(TokenizersBackend):
118
115
  )
119
116
  self.legacy_behaviour = legacy_behaviour
120
117
 
121
- if vocab is not None:
122
- if isinstance(vocab, list):
123
- self._vocab = {token: idx for idx, (token, _score) in enumerate(vocab)}
124
- else:
125
- self._vocab = vocab
126
- else:
127
- self._vocab = {
118
+ if vocab is None:
119
+ vocab = {
128
120
  str(bos_token): 0,
129
121
  str(pad_token): 1,
130
122
  str(eos_token): 2,
131
123
  str(unk_token): 3,
132
124
  }
133
-
134
- if merges is None:
135
- self._merges = []
136
- else:
137
- self._merges = merges
125
+ self._vocab = vocab
126
+ self._merges = merges or []
138
127
 
139
128
  self._tokenizer = Tokenizer(
140
129
  BPE(
@@ -158,13 +147,10 @@ class NllbTokenizer(TokenizersBackend):
158
147
  self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True)
159
148
  self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)
160
149
 
161
- tokenizer_object = self._tokenizer
162
-
163
150
  # Remove extra_special_tokens from kwargs if present to avoid conflict
164
151
  kwargs.pop("extra_special_tokens", None)
165
152
 
166
153
  super().__init__(
167
- tokenizer_object=tokenizer_object,
168
154
  bos_token=bos_token,
169
155
  eos_token=eos_token,
170
156
  sep_token=sep_token,
@@ -380,16 +380,16 @@ class NougatTokenizer(TokenizersBackend):
380
380
  pad_token (`str`, *optional*, defaults to `"<pad>"`):
381
381
  The token used for padding, for example when batching sequences of different lengths.
382
382
 
383
- vocab (`dict`, *optional*):
383
+ vocab (`str`, `dict` or `list`, *optional*):
384
384
  Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
385
385
 
386
- merges (`list`, *optional*):
386
+ merges (`str` or `list`, *optional*):
387
387
  Custom merges list. If not provided, merges are loaded from merges_file.
388
388
  """
389
389
 
390
390
  vocab_files_names = VOCAB_FILES_NAMES
391
391
  model_input_names = ["input_ids", "attention_mask"]
392
- slow_tokenizer_class = None
392
+ model = BPE
393
393
 
394
394
  def __init__(
395
395
  self,
@@ -398,28 +398,22 @@ class NougatTokenizer(TokenizersBackend):
398
398
  bos_token: str = "<s>",
399
399
  eos_token: str = "</s>",
400
400
  pad_token: str = "<pad>",
401
- vocab: Optional[dict] = None,
402
- merges: Optional[list] = None,
401
+ vocab: Optional[Union[str, dict, list]] = None,
402
+ merges: Optional[Union[str, list]] = None,
403
403
  **kwargs,
404
404
  ):
405
- if vocab is not None:
406
- self._vocab = (
407
- {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
408
- )
409
- else:
410
- self._vocab = {
405
+ self._vocab = (
406
+ vocab
407
+ if vocab is not None
408
+ else {
411
409
  str(bos_token): 0,
412
410
  str(pad_token): 1,
413
411
  str(eos_token): 2,
414
412
  str(unk_token): 3,
415
413
  "[START_REF]": 4,
416
414
  }
417
-
418
- if merges is not None:
419
- self._merges = merges
420
- else:
421
- self._merges = []
422
-
415
+ )
416
+ self._merges = merges or []
423
417
  self._tokenizer = Tokenizer(
424
418
  BPE(
425
419
  vocab=self._vocab,
@@ -464,10 +458,7 @@ class NougatTokenizer(TokenizersBackend):
464
458
  self._tokenizer.enable_truncation(max_length=4096)
465
459
  self._tokenizer.enable_padding(length=4096, pad_id=pad_token_id, pad_token=str(pad_token))
466
460
 
467
- tokenizer_object = self._tokenizer
468
-
469
461
  super().__init__(
470
- tokenizer_object=tokenizer_object,
471
462
  errors=errors,
472
463
  unk_token=unk_token,
473
464
  bos_token=bos_token,
@@ -476,45 +467,6 @@ class NougatTokenizer(TokenizersBackend):
476
467
  **kwargs,
477
468
  )
478
469
 
479
- def _post_init(self):
480
- """Post-initialization to ensure tokenizer settings are applied correctly."""
481
- # Re-apply settings to ensure they're correct after loading from pretrained
482
- self._tokenizer.normalizer = normalizers.NFKC()
483
- self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
484
- [
485
- pre_tokenizers.Split(pattern="SPL1T-TH1S-Pl3A5E", behavior="removed", invert=False),
486
- pre_tokenizers.Digits(individual_digits=True),
487
- pre_tokenizers.Split(
488
- pattern=r"[\(\)\[\]\{\}]|([!\"#\$%\&'\*\+,\-\./:;<=>\?\\\^_`\|\~])\1*",
489
- behavior="isolated",
490
- invert=False,
491
- ),
492
- pre_tokenizers.Split(pattern="\n", behavior="isolated", invert=False),
493
- pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True),
494
- ]
495
- )
496
- self._tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)
497
-
498
- # Set up post processor with bos and eos tokens
499
- bos_token_id = self.bos_token_id if self.bos_token_id is not None else 0
500
- eos_token_id = self.eos_token_id if self.eos_token_id is not None else 2
501
- pad_token_id = self.pad_token_id if self.pad_token_id is not None else 1
502
- self._tokenizer.post_processor = processors.TemplateProcessing(
503
- single=f"{self.bos_token}:0 $A:0 {self.eos_token}:0",
504
- pair="$A:0 $B:1",
505
- special_tokens=[
506
- (str(self.eos_token), eos_token_id),
507
- (str(self.bos_token), bos_token_id),
508
- ],
509
- )
510
-
511
- # Enable truncation and padding
512
- self._tokenizer.enable_truncation(max_length=4096)
513
- self._tokenizer.enable_padding(length=4096, pad_id=pad_token_id, pad_token=str(self.pad_token))
514
-
515
- # Call parent to handle AddedToken properties
516
- super()._post_init()
517
-
518
470
  def remove_hallucinated_references(self, text: str) -> str:
519
471
  """
520
472
  Remove hallucinated or missing references from the text.
@@ -443,6 +443,7 @@ class NystromformerModel(NystromformerPreTrainedModel):
443
443
  output_attentions: Optional[bool] = None,
444
444
  output_hidden_states: Optional[bool] = None,
445
445
  return_dict: Optional[bool] = None,
446
+ **kwargs,
446
447
  ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
447
448
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
448
449
  output_hidden_states = (
@@ -539,6 +540,7 @@ class NystromformerForMaskedLM(NystromformerPreTrainedModel):
539
540
  output_attentions: Optional[bool] = None,
540
541
  output_hidden_states: Optional[bool] = None,
541
542
  return_dict: Optional[bool] = None,
543
+ **kwargs,
542
544
  ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
543
545
  r"""
544
546
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -628,6 +630,7 @@ class NystromformerForSequenceClassification(NystromformerPreTrainedModel):
628
630
  output_attentions: Optional[bool] = None,
629
631
  output_hidden_states: Optional[bool] = None,
630
632
  return_dict: Optional[bool] = None,
633
+ **kwargs,
631
634
  ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
632
635
  r"""
633
636
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -709,6 +712,7 @@ class NystromformerForMultipleChoice(NystromformerPreTrainedModel):
709
712
  output_attentions: Optional[bool] = None,
710
713
  output_hidden_states: Optional[bool] = None,
711
714
  return_dict: Optional[bool] = None,
715
+ **kwargs,
712
716
  ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
713
717
  r"""
714
718
  input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -814,6 +818,7 @@ class NystromformerForTokenClassification(NystromformerPreTrainedModel):
814
818
  output_attentions: Optional[bool] = None,
815
819
  output_hidden_states: Optional[bool] = None,
816
820
  return_dict: Optional[bool] = None,
821
+ **kwargs,
817
822
  ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
818
823
  r"""
819
824
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -881,6 +886,7 @@ class NystromformerForQuestionAnswering(NystromformerPreTrainedModel):
881
886
  output_attentions: Optional[bool] = None,
882
887
  output_hidden_states: Optional[bool] = None,
883
888
  return_dict: Optional[bool] = None,
889
+ **kwargs,
884
890
  ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
885
891
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
886
892
 
@@ -34,6 +34,7 @@ import torch.nn.functional as F
34
34
  from ...activations import ACT2FN
35
35
  from ...cache_utils import Cache, DynamicCache
36
36
  from ...generation import GenerationMixin
37
+ from ...integrations import use_kernelized_func
37
38
  from ...masking_utils import create_causal_mask
38
39
  from ...modeling_layers import GradientCheckpointingLayer
39
40
  from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -41,7 +42,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
41
42
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
42
43
  from ...processing_utils import Unpack
43
44
  from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
44
- from ...utils.generic import check_model_inputs
45
+ from ...utils.generic import check_model_inputs, maybe_autocast
45
46
  from .configuration_olmo import OlmoConfig
46
47
 
47
48
 
@@ -131,7 +132,7 @@ class OlmoRotaryEmbedding(nn.Module):
131
132
  position_ids_expanded = position_ids[:, None, :].float()
132
133
 
133
134
  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
134
- with torch.autocast(device_type=device_type, enabled=False): # Force float32
135
+ with maybe_autocast(device_type=device_type, enabled=False): # Force float32
135
136
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
136
137
  emb = torch.cat((freqs, freqs), dim=-1)
137
138
  cos = emb.cos() * self.attention_scaling
@@ -212,6 +213,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
212
213
  return q_embed.to(q_type), k_embed.to(k_type)
213
214
 
214
215
 
216
+ @use_kernelized_func(apply_rotary_pos_emb)
215
217
  class OlmoAttention(nn.Module):
216
218
  """Multi-headed attention from 'Attention Is All You Need' paper"""
217
219
 
@@ -237,7 +239,6 @@ class OlmoAttention(nn.Module):
237
239
  self.o_proj = nn.Linear(
238
240
  config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
239
241
  )
240
- self.rotary_fn = apply_rotary_pos_emb
241
242
 
242
243
  def forward(
243
244
  self,
@@ -246,7 +247,6 @@ class OlmoAttention(nn.Module):
246
247
  attention_mask: Optional[torch.Tensor],
247
248
  past_key_values: Optional[Cache] = None,
248
249
  cache_position: Optional[torch.LongTensor] = None,
249
- position_ids: Optional[torch.LongTensor] = None,
250
250
  **kwargs,
251
251
  ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
252
252
  input_shape = hidden_states.shape[:-1]
@@ -29,6 +29,7 @@ from ...cache_utils import Cache
29
29
  from ...modeling_rope_utils import dynamic_rope_update
30
30
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
31
31
  from ...utils import logging
32
+ from ...utils.generic import maybe_autocast
32
33
  from ..llama.modeling_llama import (
33
34
  LlamaAttention,
34
35
  LlamaDecoderLayer,
@@ -77,7 +78,7 @@ class OlmoRotaryEmbedding(LlamaRotaryEmbedding):
77
78
  position_ids_expanded = position_ids[:, None, :].float()
78
79
 
79
80
  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
80
- with torch.autocast(device_type=device_type, enabled=False): # Force float32
81
+ with maybe_autocast(device_type=device_type, enabled=False): # Force float32
81
82
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
82
83
  emb = torch.cat((freqs, freqs), dim=-1)
83
84
  cos = emb.cos() * self.attention_scaling
@@ -121,7 +122,6 @@ class OlmoAttention(LlamaAttention):
121
122
  attention_mask: Optional[torch.Tensor],
122
123
  past_key_values: Optional[Cache] = None,
123
124
  cache_position: Optional[torch.LongTensor] = None,
124
- position_ids: Optional[torch.LongTensor] = None,
125
125
  **kwargs,
126
126
  ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
127
127
  input_shape = hidden_states.shape[:-1]
@@ -35,7 +35,7 @@ from transformers.utils.generic import TransformersKwargs
35
35
  from ...activations import ACT2FN
36
36
  from ...cache_utils import Cache, DynamicCache
37
37
  from ...generation import GenerationMixin
38
- from ...integrations import use_kernel_forward_from_hub
38
+ from ...integrations import use_kernel_forward_from_hub, use_kernelized_func
39
39
  from ...masking_utils import create_causal_mask
40
40
  from ...modeling_layers import GradientCheckpointingLayer
41
41
  from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -43,7 +43,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
43
43
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
44
44
  from ...processing_utils import Unpack
45
45
  from ...utils import auto_docstring, can_return_tuple
46
- from ...utils.generic import check_model_inputs
46
+ from ...utils.generic import check_model_inputs, maybe_autocast
47
47
  from .configuration_olmo2 import Olmo2Config
48
48
 
49
49
 
@@ -124,7 +124,7 @@ class Olmo2RotaryEmbedding(nn.Module):
124
124
  position_ids_expanded = position_ids[:, None, :].float()
125
125
 
126
126
  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
127
- with torch.autocast(device_type=device_type, enabled=False): # Force float32
127
+ with maybe_autocast(device_type=device_type, enabled=False): # Force float32
128
128
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
129
129
  emb = torch.cat((freqs, freqs), dim=-1)
130
130
  cos = emb.cos() * self.attention_scaling
@@ -205,6 +205,7 @@ def rotate_half(x):
205
205
  return torch.cat((-x2, x1), dim=-1)
206
206
 
207
207
 
208
+ @use_kernelized_func(apply_rotary_pos_emb)
208
209
  class Olmo2Attention(nn.Module):
209
210
  """Multi-headed attention from 'Attention Is All You Need' paper"""
210
211
 
@@ -230,7 +231,6 @@ class Olmo2Attention(nn.Module):
230
231
  self.o_proj = nn.Linear(
231
232
  config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
232
233
  )
233
- self.rotary_fn = apply_rotary_pos_emb
234
234
  self.q_norm = Olmo2RMSNorm(config.num_attention_heads * self.head_dim, config.rms_norm_eps)
235
235
  self.k_norm = Olmo2RMSNorm(config.num_key_value_heads * self.head_dim, config.rms_norm_eps)
236
236
 
@@ -241,7 +241,6 @@ class Olmo2Attention(nn.Module):
241
241
  attention_mask: Optional[torch.Tensor],
242
242
  past_key_values: Optional[Cache] = None,
243
243
  cache_position: Optional[torch.LongTensor] = None,
244
- position_ids: Optional[torch.LongTensor] = None,
245
244
  **kwargs: Unpack[TransformersKwargs],
246
245
  ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
247
246
  input_shape = hidden_states.shape[:-1]
@@ -219,7 +219,6 @@ class Olmo2Attention(OlmoAttention):
219
219
  attention_mask: Optional[torch.Tensor],
220
220
  past_key_values: Optional[Cache] = None,
221
221
  cache_position: Optional[torch.LongTensor] = None,
222
- position_ids: Optional[torch.LongTensor] = None,
223
222
  **kwargs: Unpack[TransformersKwargs],
224
223
  ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
225
224
  input_shape = hidden_states.shape[:-1]
@@ -30,7 +30,7 @@ from transformers.utils.generic import TransformersKwargs
30
30
  from ...activations import ACT2FN
31
31
  from ...cache_utils import Cache, DynamicCache
32
32
  from ...generation import GenerationMixin
33
- from ...integrations import use_kernel_forward_from_hub
33
+ from ...integrations import use_kernel_forward_from_hub, use_kernelized_func
34
34
  from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
35
35
  from ...modeling_layers import GradientCheckpointingLayer
36
36
  from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -38,7 +38,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
38
38
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
39
39
  from ...processing_utils import Unpack
40
40
  from ...utils import auto_docstring, can_return_tuple
41
- from ...utils.generic import check_model_inputs
41
+ from ...utils.generic import check_model_inputs, maybe_autocast
42
42
  from .configuration_olmo3 import Olmo3Config
43
43
 
44
44
 
@@ -136,6 +136,7 @@ def rotate_half(x):
136
136
  return torch.cat((-x2, x1), dim=-1)
137
137
 
138
138
 
139
+ @use_kernelized_func(apply_rotary_pos_emb)
139
140
  class Olmo3Attention(nn.Module):
140
141
  """Multi-headed attention from 'Attention Is All You Need' paper"""
141
142
 
@@ -161,7 +162,6 @@ class Olmo3Attention(nn.Module):
161
162
  self.o_proj = nn.Linear(
162
163
  config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
163
164
  )
164
- self.rotary_fn = apply_rotary_pos_emb
165
165
  self.q_norm = Olmo3RMSNorm(config.num_attention_heads * self.head_dim, config.rms_norm_eps)
166
166
  self.k_norm = Olmo3RMSNorm(config.num_key_value_heads * self.head_dim, config.rms_norm_eps)
167
167
  assert config.layer_types is not None
@@ -332,7 +332,7 @@ class Olmo3RotaryEmbedding(nn.Module):
332
332
  position_ids_expanded = position_ids[:, None, :].float()
333
333
 
334
334
  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
335
- with torch.autocast(device_type=device_type, enabled=False): # Force float32
335
+ with maybe_autocast(device_type=device_type, enabled=False): # Force float32
336
336
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
337
337
  emb = torch.cat((freqs, freqs), dim=-1)
338
338
  cos = emb.cos() * self.attention_scaling
@@ -27,7 +27,7 @@ from ... import initialization as init
27
27
  from ...activations import ACT2FN
28
28
  from ...cache_utils import Cache, DynamicCache
29
29
  from ...generation import GenerationMixin
30
- from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
30
+ from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
31
31
  from ...masking_utils import create_causal_mask
32
32
  from ...modeling_layers import GradientCheckpointingLayer
33
33
  from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
@@ -35,7 +35,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
35
35
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
36
36
  from ...processing_utils import Unpack
37
37
  from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
38
- from ...utils.generic import OutputRecorder, check_model_inputs
38
+ from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
39
39
  from .configuration_olmoe import OlmoeConfig
40
40
 
41
41
 
@@ -116,7 +116,7 @@ class OlmoeRotaryEmbedding(nn.Module):
116
116
  position_ids_expanded = position_ids[:, None, :].float()
117
117
 
118
118
  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
119
- with torch.autocast(device_type=device_type, enabled=False): # Force float32
119
+ with maybe_autocast(device_type=device_type, enabled=False): # Force float32
120
120
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
121
121
  emb = torch.cat((freqs, freqs), dim=-1)
122
122
  cos = emb.cos() * self.attention_scaling
@@ -214,6 +214,7 @@ def eager_attention_forward(
214
214
  return attn_output, attn_weights
215
215
 
216
216
 
217
+ @use_kernelized_func(apply_rotary_pos_emb)
217
218
  class OlmoeAttention(nn.Module):
218
219
  """Multi-headed attention from 'Attention Is All You Need' paper"""
219
220
 
@@ -239,7 +240,6 @@ class OlmoeAttention(nn.Module):
239
240
  self.o_proj = nn.Linear(
240
241
  config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
241
242
  )
242
- self.rotary_fn = apply_rotary_pos_emb
243
243
  self.q_norm = OlmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
244
244
  self.k_norm = OlmoeRMSNorm(
245
245
  (config.hidden_size // config.num_attention_heads) * config.num_key_value_heads, eps=config.rms_norm_eps
@@ -1316,6 +1316,7 @@ class OmDetTurboDecoder(OmDetTurboPreTrainedModel):
1316
1316
  output_attentions=None,
1317
1317
  output_hidden_states=None,
1318
1318
  return_dict=None,
1319
+ **kwargs,
1319
1320
  ):
1320
1321
  """
1321
1322
  Args:
@@ -1505,6 +1506,7 @@ class OmDetTurboForObjectDetection(OmDetTurboPreTrainedModel):
1505
1506
  output_attentions: Optional[bool] = None,
1506
1507
  output_hidden_states: Optional[bool] = None,
1507
1508
  return_dict: Optional[bool] = None,
1509
+ **kwargs,
1508
1510
  ) -> Union[tuple[torch.FloatTensor], OmDetTurboObjectDetectionOutput]:
1509
1511
  r"""
1510
1512
  classes_input_ids (`torch.LongTensor` of shape `(total_classes (>= batch_size), sequence_length)`):
@@ -39,6 +39,7 @@ from ...utils import (
39
39
  requires_backends,
40
40
  )
41
41
  from ...utils.backbone_utils import load_backbone
42
+ from ...utils.generic import maybe_autocast
42
43
  from .configuration_oneformer import OneFormerConfig
43
44
 
44
45
 
@@ -322,7 +323,7 @@ class OneFormerHungarianMatcher(nn.Module):
322
323
  align_corners=False,
323
324
  ).squeeze(1)
324
325
 
325
- with torch.autocast(device_type="cuda", enabled=False):
326
+ with maybe_autocast(device_type="cuda", enabled=False):
326
327
  pred_mask = pred_mask.float()
327
328
  target_mask = target_mask.float()
328
329
 
@@ -2872,6 +2873,7 @@ class OneFormerModel(OneFormerPreTrainedModel):
2872
2873
  output_hidden_states: Optional[bool] = None,
2873
2874
  output_attentions: Optional[bool] = None,
2874
2875
  return_dict: Optional[bool] = None,
2876
+ **kwargs,
2875
2877
  ) -> OneFormerModelOutput:
2876
2878
  r"""
2877
2879
  task_inputs (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -3058,6 +3060,7 @@ class OneFormerForUniversalSegmentation(OneFormerPreTrainedModel):
3058
3060
  output_hidden_states: Optional[bool] = None,
3059
3061
  output_attentions: Optional[bool] = None,
3060
3062
  return_dict: Optional[bool] = None,
3063
+ **kwargs,
3061
3064
  ) -> OneFormerForUniversalSegmentationOutput:
3062
3065
  r"""
3063
3066
  task_inputs (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -317,6 +317,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
317
317
  output_attentions: Optional[bool] = None,
318
318
  output_hidden_states: Optional[bool] = None,
319
319
  return_dict: Optional[bool] = None,
320
+ **kwargs,
320
321
  ) -> Union[tuple[torch.Tensor], BaseModelOutput]:
321
322
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
322
323
  output_hidden_states = (
@@ -514,6 +515,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
514
515
  output_attentions: Optional[bool] = None,
515
516
  output_hidden_states: Optional[bool] = None,
516
517
  return_dict: Optional[bool] = None,
518
+ **kwargs,
517
519
  ) -> Union[tuple[torch.Tensor], OpenAIGPTDoubleHeadsModelOutput]:
518
520
  r"""
519
521
  mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):
@@ -624,6 +626,7 @@ class OpenAIGPTForSequenceClassification(OpenAIGPTPreTrainedModel):
624
626
  output_attentions: Optional[bool] = None,
625
627
  output_hidden_states: Optional[bool] = None,
626
628
  return_dict: Optional[bool] = None,
629
+ **kwargs,
627
630
  ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
628
631
  r"""
629
632
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -14,10 +14,11 @@
14
14
  # limitations under the License.
15
15
  """Tokenization classes for OpenAI GPT."""
16
16
 
17
+ from typing import Optional, Union
18
+
17
19
  from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
18
20
  from tokenizers.models import BPE
19
21
 
20
- from ...convert_slow_tokenizer import generate_merges
21
22
  from ...tokenization_utils_tokenizers import TokenizersBackend
22
23
  from ...utils import logging
23
24
 
@@ -48,40 +49,26 @@ class OpenAIGPTTokenizer(TokenizersBackend):
48
49
  unk_token (`str`, *optional*, defaults to `"<unk>"`):
49
50
  The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
50
51
  token instead.
51
- vocab (`dict`, *optional*):
52
+ vocab (`str` or `dict[str, int]`, *optional*):
52
53
  Custom vocabulary dictionary. If not provided, a blank vocabulary is initialized.
53
- merges (`list`, *optional*):
54
+ merges (`str` or `list[str]`, *optional*):
54
55
  Custom merges list. If not provided, an empty list is used.
55
56
  """
56
57
 
57
58
  vocab_files_names = VOCAB_FILES_NAMES
58
59
  model_input_names = ["input_ids", "attention_mask"]
60
+ model = BPE
59
61
 
60
62
  def __init__(
61
63
  self,
62
- unk_token="<unk>",
63
- vocab=None,
64
- merges=None,
65
- vocab_file=None,
66
- merges_file=None,
64
+ vocab: Optional[Union[str, dict[str, int]]] = None,
65
+ merges: Optional[Union[str, list[str]]] = None,
66
+ unk_token: str = "<unk>",
67
67
  **kwargs,
68
68
  ):
69
- # Initialize vocabulary
70
- if vocab is not None:
71
- self._vocab = (
72
- {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
73
- )
74
- else:
75
- # Initialize minimal vocabulary with unk token
76
- self._vocab = {str(unk_token): 0}
77
-
78
- # Initialize merges
79
- if merges is not None:
80
- self._merges = merges if merges is not None else generate_merges(self._vocab)
81
- else:
82
- self._merges = []
69
+ self._vocab = vocab if vocab is not None else {str(unk_token): 0}
70
+ self._merges = merges or []
83
71
 
84
- # Create BPE tokenizer
85
72
  self._tokenizer = Tokenizer(
86
73
  BPE(
87
74
  vocab=self._vocab,
@@ -107,34 +94,11 @@ class OpenAIGPTTokenizer(TokenizersBackend):
107
94
  self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
108
95
  self._tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")
109
96
 
110
- tokenizer_object = self._tokenizer
111
-
112
97
  super().__init__(
113
- tokenizer_object=tokenizer_object,
114
98
  unk_token=unk_token,
115
99
  **kwargs,
116
100
  )
117
101
 
118
- self.vocab_file = vocab_file
119
- self.merges_file = merges_file
120
-
121
- def _post_init(self):
122
- """Post-initialization to ensure tokenizer settings are applied correctly."""
123
- # Re-apply settings to ensure they're correct after loading from pretrained
124
- self._tokenizer.normalizer = normalizers.Sequence(
125
- [
126
- normalizers.NFD(),
127
- normalizers.Lowercase(),
128
- normalizers.StripAccents(),
129
- ]
130
- )
131
-
132
- self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
133
- self._tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")
134
-
135
- # Call parent to handle AddedToken properties
136
- super()._post_init()
137
-
138
102
  @property
139
103
  def do_lower_case(self):
140
104
  return True
@@ -836,6 +836,7 @@ class OPTForSequenceClassification(OPTPreTrainedModel):
836
836
  output_hidden_states: Optional[bool] = None,
837
837
  return_dict: Optional[bool] = None,
838
838
  position_ids: Optional[torch.LongTensor] = None,
839
+ **kwargs,
839
840
  ) -> Union[tuple, SequenceClassifierOutputWithPast]:
840
841
  r"""
841
842
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -947,6 +948,7 @@ class OPTForQuestionAnswering(OPTPreTrainedModel):
947
948
  output_hidden_states: Optional[bool] = None,
948
949
  return_dict: Optional[bool] = None,
949
950
  position_ids: Optional[torch.LongTensor] = None,
951
+ **kwargs,
950
952
  ) -> Union[tuple, QuestionAnsweringModelOutput]:
951
953
  r"""
952
954
  Example: