transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (539)
  1. transformers/__init__.py +30 -3
  2. transformers/cli/serve.py +47 -17
  3. transformers/conversion_mapping.py +15 -2
  4. transformers/convert_slow_tokenizer.py +225 -10
  5. transformers/core_model_loading.py +196 -135
  6. transformers/data/data_collator.py +12 -4
  7. transformers/dependency_versions_table.py +1 -2
  8. transformers/dynamic_module_utils.py +1 -2
  9. transformers/feature_extraction_utils.py +1 -2
  10. transformers/file_utils.py +0 -1
  11. transformers/generation/__init__.py +11 -1
  12. transformers/generation/configuration_utils.py +3 -2
  13. transformers/generation/continuous_batching/__init__.py +4 -0
  14. transformers/generation/continuous_batching/continuous_api.py +134 -79
  15. transformers/image_processing_base.py +1 -2
  16. transformers/integrations/__init__.py +4 -2
  17. transformers/integrations/accelerate.py +15 -3
  18. transformers/integrations/aqlm.py +38 -66
  19. transformers/integrations/awq.py +48 -514
  20. transformers/integrations/bitnet.py +45 -100
  21. transformers/integrations/bitsandbytes.py +79 -191
  22. transformers/integrations/deepspeed.py +1 -0
  23. transformers/integrations/eetq.py +84 -79
  24. transformers/integrations/fbgemm_fp8.py +191 -145
  25. transformers/integrations/finegrained_fp8.py +236 -193
  26. transformers/integrations/fp_quant.py +92 -0
  27. transformers/integrations/ggml.py +11 -1
  28. transformers/integrations/higgs.py +40 -62
  29. transformers/integrations/hub_kernels.py +42 -3
  30. transformers/integrations/integration_utils.py +10 -0
  31. transformers/integrations/mxfp4.py +25 -65
  32. transformers/integrations/peft.py +7 -29
  33. transformers/integrations/quanto.py +73 -55
  34. transformers/integrations/quark.py +55 -0
  35. transformers/integrations/spqr.py +44 -90
  36. transformers/integrations/torchao.py +32 -38
  37. transformers/integrations/vptq.py +42 -59
  38. transformers/modelcard.py +1 -2
  39. transformers/modeling_gguf_pytorch_utils.py +8 -0
  40. transformers/modeling_rope_utils.py +30 -6
  41. transformers/modeling_utils.py +116 -112
  42. transformers/models/__init__.py +3 -0
  43. transformers/models/afmoe/modeling_afmoe.py +4 -4
  44. transformers/models/albert/tokenization_albert.py +6 -12
  45. transformers/models/align/modeling_align.py +2 -0
  46. transformers/models/altclip/modeling_altclip.py +4 -0
  47. transformers/models/apertus/modeling_apertus.py +4 -4
  48. transformers/models/arcee/modeling_arcee.py +4 -4
  49. transformers/models/aria/modeling_aria.py +4 -4
  50. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  51. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  52. transformers/models/auto/configuration_auto.py +11 -0
  53. transformers/models/auto/feature_extraction_auto.py +2 -0
  54. transformers/models/auto/image_processing_auto.py +1 -0
  55. transformers/models/auto/modeling_auto.py +6 -0
  56. transformers/models/auto/processing_auto.py +18 -10
  57. transformers/models/auto/tokenization_auto.py +74 -472
  58. transformers/models/autoformer/modeling_autoformer.py +4 -0
  59. transformers/models/bamba/modeling_bamba.py +4 -3
  60. transformers/models/bark/modeling_bark.py +2 -0
  61. transformers/models/bart/modeling_bart.py +7 -0
  62. transformers/models/barthez/tokenization_barthez.py +5 -10
  63. transformers/models/beit/modeling_beit.py +6 -1
  64. transformers/models/bert/tokenization_bert.py +8 -21
  65. transformers/models/big_bird/modeling_big_bird.py +6 -0
  66. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  67. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
  68. transformers/models/biogpt/modeling_biogpt.py +2 -0
  69. transformers/models/biogpt/modular_biogpt.py +2 -0
  70. transformers/models/bit/modeling_bit.py +11 -2
  71. transformers/models/bitnet/modeling_bitnet.py +4 -4
  72. transformers/models/blenderbot/modeling_blenderbot.py +5 -0
  73. transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
  74. transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
  75. transformers/models/blip/modeling_blip_text.py +2 -0
  76. transformers/models/blip_2/modeling_blip_2.py +2 -1
  77. transformers/models/bloom/modeling_bloom.py +4 -0
  78. transformers/models/blt/modeling_blt.py +2 -2
  79. transformers/models/blt/modular_blt.py +2 -2
  80. transformers/models/bridgetower/modeling_bridgetower.py +5 -1
  81. transformers/models/bros/modeling_bros.py +4 -0
  82. transformers/models/camembert/tokenization_camembert.py +8 -12
  83. transformers/models/canine/modeling_canine.py +5 -0
  84. transformers/models/chameleon/modeling_chameleon.py +2 -1
  85. transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
  86. transformers/models/clap/modeling_clap.py +5 -0
  87. transformers/models/clip/tokenization_clip.py +22 -44
  88. transformers/models/clipseg/modeling_clipseg.py +5 -0
  89. transformers/models/clvp/modeling_clvp.py +5 -0
  90. transformers/models/clvp/tokenization_clvp.py +1 -63
  91. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  92. transformers/models/codegen/tokenization_codegen.py +14 -43
  93. transformers/models/cohere/modeling_cohere.py +4 -3
  94. transformers/models/cohere/modular_cohere.py +2 -1
  95. transformers/models/cohere/tokenization_cohere.py +12 -42
  96. transformers/models/cohere2/modeling_cohere2.py +7 -6
  97. transformers/models/cohere2/modular_cohere2.py +5 -5
  98. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
  99. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  100. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  101. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  102. transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
  103. transformers/models/convbert/modeling_convbert.py +6 -0
  104. transformers/models/convnext/modeling_convnext.py +2 -4
  105. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  106. transformers/models/csm/modeling_csm.py +4 -3
  107. transformers/models/ctrl/modeling_ctrl.py +1 -0
  108. transformers/models/cvt/modeling_cvt.py +2 -0
  109. transformers/models/cwm/modeling_cwm.py +4 -4
  110. transformers/models/d_fine/modeling_d_fine.py +2 -0
  111. transformers/models/d_fine/modular_d_fine.py +1 -0
  112. transformers/models/dab_detr/modeling_dab_detr.py +4 -0
  113. transformers/models/dac/modeling_dac.py +2 -2
  114. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  115. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  116. transformers/models/dbrx/modeling_dbrx.py +2 -2
  117. transformers/models/deberta/modeling_deberta.py +5 -0
  118. transformers/models/deberta/tokenization_deberta.py +11 -20
  119. transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
  120. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  121. transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
  122. transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
  123. transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
  124. transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
  125. transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
  126. transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
  127. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  128. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  129. transformers/models/detr/modeling_detr.py +5 -0
  130. transformers/models/dia/modeling_dia.py +4 -3
  131. transformers/models/dia/modular_dia.py +0 -1
  132. transformers/models/diffllama/modeling_diffllama.py +2 -2
  133. transformers/models/dinat/modeling_dinat.py +3 -0
  134. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  135. transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
  136. transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
  137. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  138. transformers/models/doge/modeling_doge.py +2 -3
  139. transformers/models/doge/modular_doge.py +0 -1
  140. transformers/models/donut/modeling_donut_swin.py +2 -0
  141. transformers/models/dots1/modeling_dots1.py +10 -7
  142. transformers/models/dots1/modular_dots1.py +5 -3
  143. transformers/models/dpr/modeling_dpr.py +5 -0
  144. transformers/models/dpr/tokenization_dpr.py +12 -0
  145. transformers/models/edgetam/modeling_edgetam.py +1 -1
  146. transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
  147. transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
  148. transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
  149. transformers/models/efficientnet/modeling_efficientnet.py +2 -0
  150. transformers/models/emu3/modeling_emu3.py +4 -4
  151. transformers/models/eomt/image_processing_eomt.py +13 -1
  152. transformers/models/eomt/image_processing_eomt_fast.py +14 -2
  153. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  154. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  155. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
  156. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
  157. transformers/models/esm/modeling_esmfold.py +5 -4
  158. transformers/models/evolla/modeling_evolla.py +4 -4
  159. transformers/models/exaone4/modeling_exaone4.py +2 -2
  160. transformers/models/exaone4/modular_exaone4.py +0 -1
  161. transformers/models/falcon/modeling_falcon.py +6 -1
  162. transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
  163. transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
  164. transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
  165. transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
  166. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  167. transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
  168. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  169. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
  170. transformers/models/flaubert/modeling_flaubert.py +7 -0
  171. transformers/models/flava/modeling_flava.py +6 -1
  172. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
  173. transformers/models/florence2/modeling_florence2.py +2 -1
  174. transformers/models/florence2/modular_florence2.py +2 -1
  175. transformers/models/fnet/modeling_fnet.py +7 -0
  176. transformers/models/focalnet/modeling_focalnet.py +4 -0
  177. transformers/models/fsmt/modeling_fsmt.py +2 -0
  178. transformers/models/funnel/modeling_funnel.py +8 -0
  179. transformers/models/funnel/tokenization_funnel.py +17 -24
  180. transformers/models/fuyu/processing_fuyu.py +3 -3
  181. transformers/models/gemma/modeling_gemma.py +4 -4
  182. transformers/models/gemma/tokenization_gemma.py +10 -27
  183. transformers/models/gemma2/modeling_gemma2.py +4 -4
  184. transformers/models/gemma2/modular_gemma2.py +2 -1
  185. transformers/models/gemma3/modeling_gemma3.py +14 -84
  186. transformers/models/gemma3/modular_gemma3.py +12 -81
  187. transformers/models/gemma3n/modeling_gemma3n.py +18 -209
  188. transformers/models/gemma3n/modular_gemma3n.py +17 -59
  189. transformers/models/git/modeling_git.py +2 -0
  190. transformers/models/glm/modeling_glm.py +4 -4
  191. transformers/models/glm4/modeling_glm4.py +4 -4
  192. transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
  193. transformers/models/glm4v/configuration_glm4v.py +3 -1
  194. transformers/models/glm4v/modeling_glm4v.py +3 -3
  195. transformers/models/glm4v/modular_glm4v.py +6 -4
  196. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  197. transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
  198. transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
  199. transformers/models/glpn/modeling_glpn.py +2 -0
  200. transformers/models/gpt2/modeling_gpt2.py +5 -1
  201. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  202. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
  203. transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
  204. transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
  205. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  206. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  207. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
  208. transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
  209. transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
  210. transformers/models/gptj/modeling_gptj.py +3 -0
  211. transformers/models/granite/modeling_granite.py +4 -4
  212. transformers/models/granitemoe/modeling_granitemoe.py +4 -6
  213. transformers/models/granitemoe/modular_granitemoe.py +0 -2
  214. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
  215. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
  216. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
  217. transformers/models/groupvit/modeling_groupvit.py +3 -0
  218. transformers/models/helium/modeling_helium.py +4 -3
  219. transformers/models/herbert/tokenization_herbert.py +9 -25
  220. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
  221. transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
  222. transformers/models/hiera/modeling_hiera.py +4 -0
  223. transformers/models/hubert/modeling_hubert.py +3 -0
  224. transformers/models/hubert/modular_hubert.py +1 -0
  225. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
  226. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
  227. transformers/models/ibert/modeling_ibert.py +6 -0
  228. transformers/models/idefics/modeling_idefics.py +5 -21
  229. transformers/models/imagegpt/modeling_imagegpt.py +2 -1
  230. transformers/models/informer/modeling_informer.py +4 -0
  231. transformers/models/informer/modular_informer.py +1 -0
  232. transformers/models/internvl/modeling_internvl.py +2 -4
  233. transformers/models/internvl/modular_internvl.py +2 -4
  234. transformers/models/jamba/modeling_jamba.py +2 -2
  235. transformers/models/janus/modeling_janus.py +1 -0
  236. transformers/models/janus/modular_janus.py +1 -0
  237. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  238. transformers/models/kosmos2/modeling_kosmos2.py +1 -0
  239. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
  240. transformers/models/lasr/__init__.py +29 -0
  241. transformers/models/lasr/configuration_lasr.py +244 -0
  242. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  243. transformers/models/lasr/modeling_lasr.py +729 -0
  244. transformers/models/lasr/modular_lasr.py +569 -0
  245. transformers/models/lasr/processing_lasr.py +96 -0
  246. transformers/models/lasr/tokenization_lasr.py +186 -0
  247. transformers/models/layoutlm/modeling_layoutlm.py +5 -0
  248. transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
  249. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
  250. transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
  251. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  252. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  253. transformers/models/led/modeling_led.py +6 -0
  254. transformers/models/levit/modeling_levit.py +3 -0
  255. transformers/models/lfm2/modeling_lfm2.py +4 -5
  256. transformers/models/lfm2/modular_lfm2.py +0 -1
  257. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
  258. transformers/models/lightglue/modeling_lightglue.py +3 -1
  259. transformers/models/lightglue/modular_lightglue.py +1 -0
  260. transformers/models/lilt/modeling_lilt.py +4 -0
  261. transformers/models/llama/modeling_llama.py +4 -4
  262. transformers/models/llama/tokenization_llama.py +15 -43
  263. transformers/models/llama4/modeling_llama4.py +3 -2
  264. transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
  265. transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
  266. transformers/models/longformer/modeling_longformer.py +6 -0
  267. transformers/models/longt5/modeling_longt5.py +4 -0
  268. transformers/models/luke/modeling_luke.py +9 -0
  269. transformers/models/luke/tokenization_luke.py +11 -38
  270. transformers/models/lxmert/modeling_lxmert.py +2 -0
  271. transformers/models/m2m_100/modeling_m2m_100.py +4 -0
  272. transformers/models/mamba/modeling_mamba.py +14 -22
  273. transformers/models/marian/modeling_marian.py +5 -0
  274. transformers/models/markuplm/modeling_markuplm.py +4 -0
  275. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  276. transformers/models/mask2former/modeling_mask2former.py +2 -0
  277. transformers/models/maskformer/modeling_maskformer.py +2 -0
  278. transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
  279. transformers/models/mbart/modeling_mbart.py +7 -0
  280. transformers/models/mbart/tokenization_mbart.py +11 -52
  281. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  282. transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
  283. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  284. transformers/models/mimi/modeling_mimi.py +3 -1
  285. transformers/models/minimax/modeling_minimax.py +4 -4
  286. transformers/models/ministral/modeling_ministral.py +4 -4
  287. transformers/models/ministral3/configuration_ministral3.py +1 -1
  288. transformers/models/ministral3/modeling_ministral3.py +4 -3
  289. transformers/models/mistral/modeling_mistral.py +4 -3
  290. transformers/models/mixtral/modeling_mixtral.py +4 -4
  291. transformers/models/mllama/modeling_mllama.py +2 -2
  292. transformers/models/mluke/tokenization_mluke.py +6 -6
  293. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
  294. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  295. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  296. transformers/models/mobilevit/modeling_mobilevit.py +3 -0
  297. transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
  298. transformers/models/modernbert/modeling_modernbert.py +4 -1
  299. transformers/models/modernbert/modular_modernbert.py +2 -0
  300. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
  301. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
  302. transformers/models/moonshine/modeling_moonshine.py +4 -2
  303. transformers/models/moshi/modeling_moshi.py +5 -2
  304. transformers/models/mpnet/modeling_mpnet.py +5 -0
  305. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  306. transformers/models/mpt/modeling_mpt.py +2 -0
  307. transformers/models/mra/modeling_mra.py +6 -0
  308. transformers/models/mt5/modeling_mt5.py +7 -0
  309. transformers/models/musicgen/modeling_musicgen.py +2 -0
  310. transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
  311. transformers/models/mvp/modeling_mvp.py +7 -0
  312. transformers/models/nanochat/modeling_nanochat.py +4 -4
  313. transformers/models/nemotron/modeling_nemotron.py +4 -2
  314. transformers/models/nllb/tokenization_nllb.py +8 -22
  315. transformers/models/nougat/tokenization_nougat.py +11 -59
  316. transformers/models/nystromformer/modeling_nystromformer.py +6 -0
  317. transformers/models/olmo/modeling_olmo.py +4 -4
  318. transformers/models/olmo/modular_olmo.py +2 -2
  319. transformers/models/olmo2/modeling_olmo2.py +4 -5
  320. transformers/models/olmo2/modular_olmo2.py +0 -1
  321. transformers/models/olmo3/modeling_olmo3.py +4 -4
  322. transformers/models/olmoe/modeling_olmoe.py +4 -4
  323. transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
  324. transformers/models/oneformer/modeling_oneformer.py +4 -1
  325. transformers/models/openai/modeling_openai.py +3 -0
  326. transformers/models/openai/tokenization_openai.py +10 -46
  327. transformers/models/opt/modeling_opt.py +2 -0
  328. transformers/models/owlv2/modeling_owlv2.py +4 -0
  329. transformers/models/owlvit/modeling_owlvit.py +4 -0
  330. transformers/models/paddleocr_vl/__init__.py +32 -0
  331. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  332. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
  333. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  334. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
  335. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
  336. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  337. transformers/models/parakeet/configuration_parakeet.py +4 -6
  338. transformers/models/parakeet/modeling_parakeet.py +9 -6
  339. transformers/models/parakeet/modular_parakeet.py +2 -2
  340. transformers/models/parakeet/processing_parakeet.py +1 -0
  341. transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
  342. transformers/models/patchtst/modeling_patchtst.py +20 -2
  343. transformers/models/pegasus/modeling_pegasus.py +5 -0
  344. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  345. transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
  346. transformers/models/perceiver/modeling_perceiver.py +8 -0
  347. transformers/models/persimmon/modeling_persimmon.py +2 -1
  348. transformers/models/phi/modeling_phi.py +4 -5
  349. transformers/models/phi/modular_phi.py +0 -1
  350. transformers/models/phi3/modeling_phi3.py +2 -1
  351. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
  352. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
  353. transformers/models/phimoe/modeling_phimoe.py +4 -4
  354. transformers/models/phimoe/modular_phimoe.py +2 -2
  355. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  356. transformers/models/pixtral/modeling_pixtral.py +2 -1
  357. transformers/models/plbart/modeling_plbart.py +6 -0
  358. transformers/models/plbart/modular_plbart.py +2 -0
  359. transformers/models/plbart/tokenization_plbart.py +0 -2
  360. transformers/models/poolformer/modeling_poolformer.py +2 -0
  361. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  362. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  363. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  364. transformers/models/prophetnet/modeling_prophetnet.py +3 -0
  365. transformers/models/pvt/modeling_pvt.py +2 -0
  366. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  367. transformers/models/qwen2/modeling_qwen2.py +4 -4
  368. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  369. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  370. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
  371. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
  372. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  373. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
  374. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
  375. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
  376. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  377. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  378. transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
  379. transformers/models/qwen3/modeling_qwen3.py +4 -4
  380. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  381. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
  382. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
  383. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
  384. transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
  385. transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
  386. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
  387. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
  388. transformers/models/rag/modeling_rag.py +1 -0
  389. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
  390. transformers/models/reformer/modeling_reformer.py +4 -0
  391. transformers/models/reformer/tokenization_reformer.py +11 -28
  392. transformers/models/regnet/modeling_regnet.py +6 -1
  393. transformers/models/rembert/modeling_rembert.py +6 -0
  394. transformers/models/rembert/tokenization_rembert.py +3 -10
  395. transformers/models/resnet/modeling_resnet.py +11 -2
  396. transformers/models/roberta/tokenization_roberta.py +18 -27
  397. transformers/models/roformer/modeling_roformer.py +6 -0
  398. transformers/models/roformer/tokenization_roformer.py +77 -412
  399. transformers/models/rt_detr/modeling_rt_detr.py +2 -0
  400. transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
  401. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
  402. transformers/models/rwkv/modeling_rwkv.py +1 -0
  403. transformers/models/sam2/modeling_sam2.py +2 -2
  404. transformers/models/sam2/modular_sam2.py +2 -2
  405. transformers/models/sam2_video/modeling_sam2_video.py +1 -0
  406. transformers/models/sam2_video/modular_sam2_video.py +1 -0
  407. transformers/models/sam3/modeling_sam3.py +77 -80
  408. transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
  409. transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
  410. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
  411. transformers/models/sam3_video/modeling_sam3_video.py +1 -0
  412. transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
  413. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  414. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
  415. transformers/models/seed_oss/modeling_seed_oss.py +2 -2
  416. transformers/models/segformer/modeling_segformer.py +4 -1
  417. transformers/models/seggpt/modeling_seggpt.py +2 -0
  418. transformers/models/sew/modeling_sew.py +3 -0
  419. transformers/models/sew/modular_sew.py +1 -0
  420. transformers/models/sew_d/modeling_sew_d.py +3 -0
  421. transformers/models/siglip2/modeling_siglip2.py +4 -0
  422. transformers/models/siglip2/modular_siglip2.py +4 -0
  423. transformers/models/smollm3/modeling_smollm3.py +4 -4
  424. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  425. transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
  426. transformers/models/speecht5/modeling_speecht5.py +13 -1
  427. transformers/models/splinter/modeling_splinter.py +3 -0
  428. transformers/models/splinter/tokenization_splinter.py +9 -28
  429. transformers/models/squeezebert/modeling_squeezebert.py +6 -0
  430. transformers/models/stablelm/modeling_stablelm.py +3 -1
  431. transformers/models/starcoder2/modeling_starcoder2.py +4 -3
  432. transformers/models/superglue/modeling_superglue.py +1 -0
  433. transformers/models/superpoint/modeling_superpoint.py +1 -0
  434. transformers/models/swiftformer/modeling_swiftformer.py +2 -0
  435. transformers/models/swin/modeling_swin.py +4 -0
  436. transformers/models/swin2sr/modeling_swin2sr.py +2 -0
  437. transformers/models/swinv2/modeling_swinv2.py +4 -0
  438. transformers/models/t5/modeling_t5.py +7 -0
  439. transformers/models/t5/tokenization_t5.py +4 -8
  440. transformers/models/t5gemma/modeling_t5gemma.py +5 -5
  441. transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
  442. transformers/models/table_transformer/modeling_table_transformer.py +4 -0
  443. transformers/models/tapas/modeling_tapas.py +3 -0
  444. transformers/models/textnet/modeling_textnet.py +11 -2
  445. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  446. transformers/models/timesfm/modeling_timesfm.py +2 -0
  447. transformers/models/timesfm/modular_timesfm.py +2 -0
  448. transformers/models/timesformer/modeling_timesformer.py +2 -0
  449. transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
  450. transformers/models/trocr/modeling_trocr.py +2 -0
  451. transformers/models/tvp/modeling_tvp.py +2 -0
  452. transformers/models/udop/modeling_udop.py +4 -0
  453. transformers/models/udop/tokenization_udop.py +5 -13
  454. transformers/models/umt5/modeling_umt5.py +7 -0
  455. transformers/models/unispeech/modeling_unispeech.py +4 -0
  456. transformers/models/unispeech/modular_unispeech.py +2 -0
  457. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  458. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  459. transformers/models/univnet/modeling_univnet.py +1 -0
  460. transformers/models/upernet/modeling_upernet.py +1 -0
  461. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  462. transformers/models/vilt/modeling_vilt.py +6 -0
  463. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  464. transformers/models/visual_bert/modeling_visual_bert.py +6 -0
  465. transformers/models/vitdet/modeling_vitdet.py +2 -0
  466. transformers/models/vitmatte/modeling_vitmatte.py +1 -0
  467. transformers/models/vits/modeling_vits.py +1 -0
  468. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  469. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  470. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
  471. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
  472. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
  473. transformers/models/wavlm/modeling_wavlm.py +5 -0
  474. transformers/models/whisper/modeling_whisper.py +6 -0
  475. transformers/models/whisper/tokenization_whisper.py +4 -15
  476. transformers/models/x_clip/modeling_x_clip.py +3 -0
  477. transformers/models/xglm/modeling_xglm.py +1 -0
  478. transformers/models/xglm/tokenization_xglm.py +4 -9
  479. transformers/models/xlm/modeling_xlm.py +5 -0
  480. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  481. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  482. transformers/models/yoso/modeling_yoso.py +6 -0
  483. transformers/models/zamba/modeling_zamba.py +2 -0
  484. transformers/models/zamba2/modeling_zamba2.py +4 -2
  485. transformers/models/zamba2/modular_zamba2.py +1 -1
  486. transformers/models/zoedepth/modeling_zoedepth.py +1 -0
  487. transformers/pipelines/__init__.py +2 -3
  488. transformers/pipelines/base.py +1 -9
  489. transformers/pipelines/document_question_answering.py +3 -1
  490. transformers/pipelines/text_generation.py +1 -1
  491. transformers/processing_utils.py +23 -11
  492. transformers/quantizers/base.py +35 -110
  493. transformers/quantizers/quantizer_aqlm.py +1 -5
  494. transformers/quantizers/quantizer_auto_round.py +1 -2
  495. transformers/quantizers/quantizer_awq.py +17 -81
  496. transformers/quantizers/quantizer_bitnet.py +3 -8
  497. transformers/quantizers/quantizer_bnb_4bit.py +13 -110
  498. transformers/quantizers/quantizer_bnb_8bit.py +16 -92
  499. transformers/quantizers/quantizer_compressed_tensors.py +1 -5
  500. transformers/quantizers/quantizer_eetq.py +14 -62
  501. transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
  502. transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
  503. transformers/quantizers/quantizer_fp_quant.py +48 -78
  504. transformers/quantizers/quantizer_gptq.py +7 -24
  505. transformers/quantizers/quantizer_higgs.py +40 -54
  506. transformers/quantizers/quantizer_hqq.py +144 -153
  507. transformers/quantizers/quantizer_mxfp4.py +13 -167
  508. transformers/quantizers/quantizer_quanto.py +20 -64
  509. transformers/quantizers/quantizer_quark.py +36 -17
  510. transformers/quantizers/quantizer_spqr.py +1 -4
  511. transformers/quantizers/quantizer_torchao.py +23 -202
  512. transformers/quantizers/quantizer_vptq.py +8 -22
  513. transformers/quantizers/quantizers_utils.py +20 -0
  514. transformers/testing_utils.py +297 -36
  515. transformers/tokenization_mistral_common.py +4 -0
  516. transformers/tokenization_utils_base.py +113 -222
  517. transformers/tokenization_utils_tokenizers.py +168 -107
  518. transformers/trainer.py +28 -31
  519. transformers/trainer_jit_checkpoint.py +126 -0
  520. transformers/trainer_utils.py +1 -1
  521. transformers/training_args.py +66 -28
  522. transformers/utils/__init__.py +3 -4
  523. transformers/utils/auto_docstring.py +1 -0
  524. transformers/utils/generic.py +27 -1
  525. transformers/utils/hub.py +5 -15
  526. transformers/utils/import_utils.py +61 -16
  527. transformers/utils/kernel_config.py +4 -2
  528. transformers/utils/loading_report.py +19 -10
  529. transformers/utils/quantization_config.py +75 -242
  530. transformers/video_processing_utils.py +1 -2
  531. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
  532. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
  533. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
  534. transformers/kernels/__init__.py +0 -0
  535. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  536. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  537. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
  538. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
  539. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
@@ -903,6 +903,7 @@ class AutoformerEncoder(AutoformerPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, BaseModelOutput]:
  r"""
  Args:
@@ -1024,6 +1025,7 @@ class AutoformerDecoder(AutoformerPreTrainedModel):
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  cache_position: Optional[torch.Tensor] = None,
+ **kwargs,
  ) -> Union[tuple, AutoFormerDecoderOutput]:
  r"""
  Args:
@@ -1360,6 +1362,7 @@ class AutoformerModel(AutoformerPreTrainedModel):
  use_cache: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  cache_position: Optional[torch.Tensor] = None,
+ **kwargs,
  ) -> Union[AutoformerModelOutput, tuple]:
  r"""
  past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
@@ -1610,6 +1613,7 @@ class AutoformerForPrediction(AutoformerPreTrainedModel):
  output_attentions: Optional[bool] = None,
  use_cache: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[Seq2SeqTSPredictionOutput, tuple]:
  r"""
  past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
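Note: these Autoformer hunks, like many of the hunks below, add a catch-all `**kwargs` to `forward()` so that extra keyword arguments passed down by shared infrastructure no longer raise a `TypeError`. A minimal, self-contained sketch of the pattern (the toy module here is illustrative, not part of transformers):

```python
from typing import Optional

import torch
from torch import nn


class TinyEncoder(nn.Module):
    """Toy module whose forward tolerates unexpected keyword arguments."""

    def __init__(self, hidden_size: int = 8):
        super().__init__()
        self.proj = nn.Linear(hidden_size, hidden_size)

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        output_attentions: Optional[bool] = None,
        **kwargs,  # anything else (e.g. position_ids) is accepted and ignored
    ) -> torch.Tensor:
        return self.proj(inputs_embeds)


enc = TinyEncoder()
x = torch.randn(2, 4, 8)
# Without **kwargs this call would fail with a TypeError about position_ids.
out = enc(x, output_attentions=False, position_ids=torch.arange(4))
print(out.shape)  # torch.Size([2, 4, 8])
```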
@@ -35,7 +35,7 @@ from transformers.activations import ACT2FN
  from ... import initialization as init
  from ...cache_utils import Cache
  from ...generation import GenerationMixin
- from ...integrations import use_kernel_forward_from_hub
+ from ...integrations import use_kernel_forward_from_hub, use_kernelized_func
  from ...modeling_attn_mask_utils import AttentionMaskConverter
  from ...modeling_layers import GradientCheckpointingLayer
  from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -43,6 +43,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
  from ...processing_utils import Unpack
  from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+ from ...utils.generic import maybe_autocast
  from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
  from .configuration_bamba import BambaConfig

@@ -250,7 +251,7 @@ class BambaRotaryEmbedding(nn.Module):
  position_ids_expanded = position_ids[:, None, :].float()

  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
- with torch.autocast(device_type=device_type, enabled=False): # Force float32
+ with maybe_autocast(device_type=device_type, enabled=False): # Force float32
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
  emb = torch.cat((freqs, freqs), dim=-1)
  cos = emb.cos() * self.attention_scaling
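Note: the rotary-embedding hunk swaps `torch.autocast` for `maybe_autocast`, imported from `transformers.utils.generic`. Its implementation is not shown in this diff; a plausible sketch, assuming it only falls back to a no-op context when autocast is unavailable for the device (the real helper may differ):

```python
import contextlib

import torch


def maybe_autocast(device_type: str, **kwargs):
    """Hypothetical stand-in for transformers.utils.generic.maybe_autocast:
    use torch.autocast when the device type supports it, else do nothing."""
    try:
        return torch.autocast(device_type=device_type, **kwargs)
    except RuntimeError:
        return contextlib.nullcontext()


# Usage mirroring the hunk: disable autocast so the RoPE math stays in float32.
inv_freq = torch.randn(1, 4, 1)
position_ids = torch.arange(3, dtype=torch.float32)[None, None, :]
with maybe_autocast(device_type="cpu", enabled=False):
    freqs = (inv_freq.float() @ position_ids.float()).transpose(1, 2)
print(freqs.dtype)  # torch.float32
```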
@@ -345,6 +346,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
  return q_embed, k_embed


+ @use_kernelized_func(apply_rotary_pos_emb)
  class BambaAttention(nn.Module):
  """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -370,7 +372,6 @@ class BambaAttention(nn.Module):
  self.o_proj = nn.Linear(
  config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
  )
- self.rotary_fn = apply_rotary_pos_emb

  def forward(
  self,
@@ -426,6 +426,7 @@ class BarkCausalModel(BarkPreTrainedModel, GenerationMixin):
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  cache_position: Optional[torch.Tensor] = None,
+ **kwargs,
  ) -> Union[tuple[torch.Tensor], CausalLMOutputWithPast]:
  r"""
  input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
@@ -1028,6 +1029,7 @@ class BarkFineModel(BarkPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
  r"""
  codebook_idx (`int`):
@@ -547,6 +547,7 @@ class BartEncoder(BartPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, BaseModelOutput]:
  r"""
  Args:
@@ -694,6 +695,7 @@ class BartDecoder(BartPreTrainedModel):
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  cache_position: Optional[torch.LongTensor] = None,
+ **kwargs,
  ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
  r"""
  Args:
@@ -921,6 +923,7 @@ class BartModel(BartPreTrainedModel):
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  cache_position: Optional[torch.LongTensor] = None,
+ **kwargs,
  ) -> Union[tuple, Seq2SeqModelOutput]:
  r"""
  decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1067,6 +1070,7 @@ class BartForConditionalGeneration(BartPreTrainedModel, GenerationMixin):
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  cache_position: Optional[torch.LongTensor] = None,
+ **kwargs,
  ) -> Union[tuple, Seq2SeqLMOutput]:
  r"""
  decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1228,6 +1232,7 @@ class BartForSequenceClassification(BartPreTrainedModel):
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  cache_position: Optional[torch.LongTensor] = None,
+ **kwargs,
  ) -> Union[tuple, Seq2SeqSequenceClassifierOutput]:
  r"""
  decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1360,6 +1365,7 @@ class BartForQuestionAnswering(BartPreTrainedModel):
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  cache_position: Optional[torch.LongTensor] = None,
+ **kwargs,
  ) -> Union[tuple, Seq2SeqQuestionAnsweringModelOutput]:
  r"""
  decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1505,6 +1511,7 @@ class BartForCausalLM(BartPreTrainedModel, GenerationMixin):
  return_dict: Optional[bool] = None,
  cache_position: Optional[torch.LongTensor] = None,
  logits_to_keep: Union[int, torch.Tensor] = 0,
+ **kwargs,
  ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -14,6 +14,8 @@
  # limitations under the License
  """Tokenization classes for the BARThez model."""

+ from typing import Optional, Union
+
  from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers
  from tokenizers.models import Unigram

@@ -77,7 +79,7 @@ class BarthezTokenizer(TokenizersBackend):
  vocab_file (`str`, *optional*):
  [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
  contains the vocabulary necessary to instantiate a tokenizer.
- vocab (`dict`, *optional*):
+ vocab (`str`, `dict` or `list`, *optional*):
  Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
  add_prefix_space (`bool`, *optional*, defaults to `True`):
  Whether or not to add an initial space to the input. This allows to treat the leading word just as any
@@ -90,6 +92,7 @@ class BarthezTokenizer(TokenizersBackend):

  def __init__(
  self,
+ vocab: Optional[Union[str, dict, list]] = None,
  bos_token="<s>",
  eos_token="</s>",
  sep_token="</s>",
@@ -97,15 +100,12 @@ class BarthezTokenizer(TokenizersBackend):
  unk_token="<unk>",
  pad_token="<pad>",
  mask_token="<mask>",
- vocab_file=None,
- vocab=None,
  add_prefix_space=True,
  **kwargs,
  ):
  # Mask token behave like a normal word, i.e. include the space before it
  mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
  self.add_prefix_space = add_prefix_space
- self.vocab_file = vocab_file

  if vocab is not None:
  self._vocab = vocab
@@ -122,10 +122,7 @@ class BarthezTokenizer(TokenizersBackend):

  self._tokenizer.normalizer = normalizers.Sequence(
  [
- normalizers.Replace("\n", " "),
- normalizers.Replace("\r", " "),
- normalizers.Replace("\t", " "),
- normalizers.Replace(Regex(r" {2,}"), " "),
+ normalizers.Replace(Regex(r"\s{2,}|[\n\r\t]"), " "),
  normalizers.NFC(),
  normalizers.Strip(left=False, right=True),
  ]
@@ -134,9 +131,7 @@ class BarthezTokenizer(TokenizersBackend):
  self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)
  self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)

- tokenizer_object = self._tokenizer
  super().__init__(
- tokenizer_object=tokenizer_object,
  bos_token=bos_token,
  eos_token=eos_token,
  unk_token=unk_token,
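Note: the BARThez normalizer hunk folds four `Replace` rules into a single regex. A quick standalone check of the consolidated rule, using only the `tokenizers` package (the rest of the BarthezTokenizer setup is omitted):

```python
from tokenizers import Regex, normalizers

normalizer = normalizers.Sequence(
    [
        normalizers.Replace(Regex(r"\s{2,}|[\n\r\t]"), " "),
        normalizers.NFC(),
        normalizers.Strip(left=False, right=True),
    ]
)

# Tabs, newlines and runs of whitespace all collapse to a single space,
# which is what the four separate Replace rules did before.
print(repr(normalizer.normalize_str("Hello\tworld\r\n  again   ")))
# expected: 'Hello world again'
```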
@@ -216,7 +216,7 @@ class BeitPatchEmbeddings(nn.Module):
  "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
  )

- embeddings = self.projection(pixel_values)
+ embeddings = self.projection(pixel_values.to(self.projection.weight.dtype))
  patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
  embeddings = embeddings.flatten(2).transpose(1, 2)

@@ -726,6 +726,7 @@ class BeitModel(BeitPreTrainedModel):
  output_hidden_states: Optional[bool] = None,
  interpolate_pos_encoding: bool = False,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, BeitModelOutputWithPooling]:
  r"""
  bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
@@ -818,6 +819,7 @@ class BeitForMaskedImageModeling(BeitPreTrainedModel):
  output_hidden_states: Optional[bool] = None,
  interpolate_pos_encoding: bool = False,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, MaskedLMOutput]:
  r"""
  bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
@@ -911,6 +913,7 @@ class BeitForImageClassification(BeitPreTrainedModel):
  output_hidden_states: Optional[bool] = None,
  interpolate_pos_encoding: bool = False,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, ImageClassifierOutput]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1244,6 +1247,7 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
  output_hidden_states: Optional[bool] = None,
  interpolate_pos_encoding: bool = False,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, SemanticSegmenterOutput]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
@@ -1371,6 +1375,7 @@ class BeitBackbone(BeitPreTrainedModel, BackboneMixin):
  output_hidden_states: Optional[bool] = None,
  output_attentions: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> BackboneOutput:
  r"""
  Examples:
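Note: the Beit patch-embedding hunk casts `pixel_values` to the projection weight's dtype before the convolution. A minimal illustration of the mismatch this avoids when the model is loaded in reduced precision:

```python
import torch
from torch import nn

projection = nn.Conv2d(3, 8, kernel_size=4, stride=4).to(torch.bfloat16)
pixel_values = torch.randn(1, 3, 16, 16)  # float32, as image processors usually emit

try:
    projection(pixel_values)  # fails: input and weight dtypes differ
except RuntimeError as err:
    print("dtype mismatch:", err)

embeddings = projection(pixel_values.to(projection.weight.dtype))
print(embeddings.dtype)  # torch.bfloat16
```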
@@ -15,7 +15,7 @@
  """Tokenization classes for Bert."""

  import collections
- from typing import Optional
+ from typing import Optional, Union

  from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
  from tokenizers.models import WordPiece
@@ -48,8 +48,8 @@ class BertTokenizer(TokenizersBackend):
  this superclass for more information regarding those methods.

  Args:
- vocab_file (`str`, *optional*):
- File containing the vocabulary.
+ vocab (`str` or `dict[str, int]`, *optional*):
+ Custom vocabulary dictionary. If not provided, vocabulary is loaded from `vocab_file`.
  do_lower_case (`bool`, *optional*, defaults to `False`):
  Whether or not to lowercase the input when tokenizing.
  unk_token (`str`, *optional*, defaults to `"[UNK]"`):
@@ -72,17 +72,15 @@ class BertTokenizer(TokenizersBackend):
  strip_accents (`bool`, *optional*):
  Whether or not to strip all accents. If this option is not specified, then it will be determined by the
  value for `lowercase` (as in the original BERT).
- vocab (`dict`, *optional*):
- Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
  """

  vocab_files_names = VOCAB_FILES_NAMES
  model_input_names = ["input_ids", "token_type_ids", "attention_mask"]
- slow_tokenizer_class = None
+ model = WordPiece

  def __init__(
  self,
- vocab_file: Optional[str] = None,
+ vocab: Optional[Union[str, dict[str, int]]] = None,
  do_lower_case: bool = False,
  unk_token: str = "[UNK]",
  sep_token: str = "[SEP]",
@@ -91,28 +89,21 @@ class BertTokenizer(TokenizersBackend):
  mask_token: str = "[MASK]",
  tokenize_chinese_chars: bool = True,
  strip_accents: Optional[bool] = None,
- vocab: Optional[dict] = None,
  **kwargs,
  ):
  self.do_lower_case = do_lower_case
  self.tokenize_chinese_chars = tokenize_chinese_chars
  self.strip_accents = strip_accents
-
- if vocab is not None:
- self._vocab = (
- {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
- )
- else:
- self._vocab = {
+ if vocab is None:
+ vocab = {
  str(pad_token): 0,
  str(unk_token): 1,
  str(cls_token): 2,
  str(sep_token): 3,
  str(mask_token): 4,
  }
-
+ self._vocab = vocab
  self._tokenizer = Tokenizer(WordPiece(self._vocab, unk_token=str(unk_token)))
-
  self._tokenizer.normalizer = normalizers.BertNormalizer(
  clean_text=True,
  handle_chinese_chars=tokenize_chinese_chars,
@@ -121,11 +112,7 @@ class BertTokenizer(TokenizersBackend):
  )
  self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
  self._tokenizer.decoder = decoders.WordPiece(prefix="##")
-
- tokenizer_object = self._tokenizer
-
  super().__init__(
- tokenizer_object=tokenizer_object,
  do_lower_case=do_lower_case,
  unk_token=unk_token,
  sep_token=sep_token,
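Note: for orientation on what the refactored `BertTokenizer.__init__` now builds, here is the equivalent standalone `tokenizers` setup. This is a sketch based only on the lines visible in the hunks (the `lowercase`/`strip_accents` arguments follow the documented defaults); the real class layers special-token handling from `TokenizersBackend` on top:

```python
from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
from tokenizers.models import WordPiece

# Default vocabulary created when `vocab is None`, as in the hunk above.
vocab = {"[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3, "[MASK]": 4}

tokenizer = Tokenizer(WordPiece(vocab, unk_token="[UNK]"))
tokenizer.normalizer = normalizers.BertNormalizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=None,
    lowercase=False,
)
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
tokenizer.decoder = decoders.WordPiece(prefix="##")

# Any word outside the five special tokens maps to [UNK].
print(tokenizer.encode("hello world").tokens)  # ['[UNK]', '[UNK]']
```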
@@ -1918,6 +1918,7 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[BigBirdForPreTrainingOutput, tuple[torch.FloatTensor]]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -2028,6 +2029,7 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[MaskedLMOutput, tuple[torch.FloatTensor]]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -2277,6 +2279,7 @@ class BigBirdForSequenceClassification(BigBirdPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[SequenceClassifierOutput, tuple[torch.FloatTensor]]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -2394,6 +2397,7 @@ class BigBirdForMultipleChoice(BigBirdPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[MultipleChoiceModelOutput, tuple[torch.FloatTensor]]:
  r"""
  input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -2500,6 +2504,7 @@ class BigBirdForTokenClassification(BigBirdPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[TokenClassifierOutput, tuple[torch.FloatTensor]]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -2591,6 +2596,7 @@ class BigBirdForQuestionAnswering(BigBirdPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[BigBirdForQuestionAnsweringModelOutput, tuple[torch.FloatTensor]]:
  r"""
  question_lengths (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
@@ -14,6 +14,8 @@
14
14
  # limitations under the License.
15
15
  """Tokenization classes for Big Bird model."""
16
16
 
17
+ from typing import Optional, Union
18
+
17
19
  from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
18
20
  from tokenizers.models import Unigram
19
21
 
@@ -37,7 +39,7 @@ class BigBirdTokenizer(TokenizersBackend):
37
39
  this superclass for more information regarding those methods
38
40
 
39
41
  Args:
40
- vocab (`dict`, *optional*):
42
+ vocab (`str`, `dict` or `list`, *optional*):
41
43
  Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
42
44
  unk_token (`str`, *optional*, defaults to `"<unk>"`):
43
45
  The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
@@ -80,10 +82,11 @@ class BigBirdTokenizer(TokenizersBackend):
80
82
  vocab_files_names = VOCAB_FILES_NAMES
81
83
  model_input_names = ["input_ids", "attention_mask"]
82
84
  prefix_tokens: list[int] = []
85
+ model = Unigram
83
86
 
84
87
  def __init__(
85
88
  self,
86
- vocab=None,
89
+ vocab: Optional[Union[str, dict, list]] = None,
87
90
  unk_token="<unk>",
88
91
  bos_token="<s>",
89
92
  eos_token="</s>",
@@ -92,8 +95,6 @@ class BigBirdTokenizer(TokenizersBackend):
92
95
  mask_token="[MASK]",
93
96
  cls_token="[CLS]",
94
97
  add_prefix_space=True,
95
- vocab_file=None,
96
- tokenizer_file=None,
97
98
  **kwargs,
98
99
  ):
99
100
  bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
@@ -105,47 +106,18 @@ class BigBirdTokenizer(TokenizersBackend):
105
106
  mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
106
107
 
107
108
  self.add_prefix_space = add_prefix_space
108
- self.vocab_file = vocab_file
109
109
 
110
110
  # Convert vocab to list of (token, score) tuples
111
111
  if vocab is None:
112
- vocab_scores = [(str(pad_token), 0.0), (str(eos_token), 0.0), (str(bos_token), 0.0)]
113
- elif isinstance(vocab, dict):
114
- vocab_scores = [(str(token), float(score)) for token, score in vocab.items()]
115
- elif isinstance(vocab, list) and len(vocab) > 0:
116
- if isinstance(vocab[0], (tuple, list)):
117
- vocab_scores = [(str(token), float(score)) for token, score in vocab]
118
- else:
119
- vocab_scores = [(str(token), 0.0) for token in vocab]
120
- else:
121
- vocab_scores = [(str(pad_token), 0.0), (str(eos_token), 0.0), (str(bos_token), 0.0)]
122
-
123
- # Find unk_id in vocab
124
- unk_token_content = str(unk_token)
125
- unk_id = next((idx for idx, (token, _) in enumerate(vocab_scores) if token == unk_token_content), None)
126
- if unk_id is None:
127
- unk_id = min(len(vocab_scores), 100)
128
- if len(vocab_scores) > 100:
129
- vocab_scores.insert(100, (unk_token_content, 0.0))
130
- else:
131
- vocab_scores.append((unk_token_content, 0.0))
132
-
133
- # Ensure cls_token and sep_token are in vocab
134
- cls_token_str = str(cls_token)
135
- sep_token_str = str(sep_token)
136
- cls_token_id = next((idx for idx, (token, _) in enumerate(vocab_scores) if token == cls_token_str), None)
137
- sep_token_id = next((idx for idx, (token, _) in enumerate(vocab_scores) if token == sep_token_str), None)
112
+ vocab = [(str(pad_token), 0.0), (str(eos_token), 0.0), (str(bos_token), 0.0), (str(unk_token), 0.0)]
113
+ unk_id = 3
114
+ elif isinstance(vocab, list):
115
+ # vocab.insert(100, (str(unk_token), 0.0)) # Ensure unk_token is in vocab at index 100
116
+ unk_id = vocab.index((str(unk_token), 0.0)) if (str(unk_token), 0.0) in vocab else 100
138
117
 
139
- if cls_token_id is None:
140
- cls_token_id = len(vocab_scores)
141
- vocab_scores.append((cls_token_str, 0.0))
142
- if sep_token_id is None:
143
- sep_token_id = len(vocab_scores)
144
- vocab_scores.append((sep_token_str, 0.0))
145
-
146
- self._tokenizer = Tokenizer(Unigram(vocab_scores, unk_id=unk_id, byte_fallback=False))
118
+ self._tokenizer = Tokenizer(Unigram(vocab, unk_id=unk_id, byte_fallback=False))
147
119
  self._tokenizer.normalizer = normalizers.Sequence(
148
- [normalizers.Strip(left=False, right=True), normalizers.Replace(Regex(r" {2,}"), SPIECE_UNDERLINE)]
120
+ [normalizers.Strip(left=False, right=False), normalizers.Replace(Regex(r" {2,}"), SPIECE_UNDERLINE)]
149
121
  )
150
122
 
151
123
  prepend_scheme = "always" if add_prefix_space else "never"
@@ -155,7 +127,6 @@ class BigBirdTokenizer(TokenizersBackend):
         self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme, split=True)

         super().__init__(
-            tokenizer_object=self._tokenizer,
             bos_token=bos_token,
             eos_token=eos_token,
             unk_token=unk_token,
@@ -163,10 +134,15 @@ class BigBirdTokenizer(TokenizersBackend):
             mask_token=mask_token,
             cls_token=cls_token,
             sep_token=sep_token,
+            add_prefix_space=add_prefix_space,
             **kwargs,
         )

-        self.init_kwargs["add_prefix_space"] = add_prefix_space
+        # Ensure cls_token and sep_token are in vocab
+        cls_token_str = str(cls_token)
+        sep_token_str = str(sep_token)
+        cls_token_id = self.cls_token_id
+        sep_token_id = self.sep_token_id

         self._tokenizer.post_processor = processors.TemplateProcessing(
             single=f"{cls_token_str}:0 $A:0 {sep_token_str}:0",
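The post-processor rebuilt at the end of this hunk uses `tokenizers`' `TemplateProcessing` to wrap sequences in the `cls`/`sep` tokens. A minimal, self-contained sketch of that API follows, using a toy WordLevel vocabulary and illustrative special-token ids rather than BigBird's real ones; the pair template is likewise only an example of the mechanism.

```python
from tokenizers import Tokenizer, pre_tokenizers, processors
from tokenizers.models import WordLevel

vocab = {"[CLS]": 0, "[SEP]": 1, "[UNK]": 2, "hello": 3, "world": 4}
tok = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
tok.pre_tokenizer = pre_tokenizers.Whitespace()

# Single sequences become "[CLS] $A [SEP]"; pairs append "$B [SEP]" with token type id 1.
tok.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 0), ("[SEP]", 1)],
)

enc = tok.encode("hello world")
print(enc.tokens)    # ['[CLS]', 'hello', 'world', '[SEP]']
print(enc.type_ids)  # [0, 0, 0, 0]
```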
@@ -1154,7 +1154,6 @@ class BigBirdPegasusEncoderAttention(nn.Module):
         return outputs


-# Copied from transformers.models.bert.modeling_bert.eager_attention_forward
 def eager_attention_forward(
     module: nn.Module,
     query: torch.Tensor,
@@ -1178,7 +1177,7 @@ def eager_attention_forward(
     attn_weights = nn.functional.softmax(attn_weights, dim=-1)
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

-    attn_output = torch.matmul(attn_weights, value)
+    attn_output = torch.matmul(attn_weights.to(value.dtype), value)
     attn_output = attn_output.transpose(1, 2).contiguous()

     return attn_output, attn_weights
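The change to `eager_attention_forward` casts the attention probabilities back to the dtype of `value` before the final matmul. A small sketch (shapes and dtypes are illustrative, not taken from the model) of the situation this guards against: if the softmax output ends up in float32 while the value states are in reduced precision, `torch.matmul` raises a dtype-mismatch error, whereas casting first keeps the output in the value dtype.

```python
import torch

attn_weights = torch.softmax(torch.randn(1, 2, 4, 4, dtype=torch.float32), dim=-1)  # float32 probabilities
value = torch.randn(1, 2, 4, 8, dtype=torch.bfloat16)                               # reduced-precision values

# torch.matmul(attn_weights, value) would fail here because the dtypes differ.
attn_output = torch.matmul(attn_weights.to(value.dtype), value)
assert attn_output.dtype == torch.bfloat16
```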
@@ -1595,6 +1594,7 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -1868,6 +1868,7 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -2097,6 +2098,7 @@ class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
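These hunks, and the BigBirdPegasus, BioGpt, and Bit hunks that follow, all apply the same pattern: appending `**kwargs` to `forward` so extra keyword arguments are tolerated instead of raising a `TypeError`. A quick, illustrative way to confirm the new signature against an installed build (not part of the diff itself):

```python
import inspect

from transformers import BigBirdPegasusModel

# The forward signature now ends in a VAR_KEYWORD parameter (**kwargs).
params = inspect.signature(BigBirdPegasusModel.forward).parameters.values()
assert any(p.kind is inspect.Parameter.VAR_KEYWORD for p in params)
```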
@@ -2235,6 +2237,7 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel, Gene
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -2369,6 +2372,7 @@ class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqSequenceClassifierOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -2490,6 +2494,7 @@ class BigBirdPegasusForQuestionAnswering(BigBirdPegasusPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple, Seq2SeqQuestionAnsweringModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -2616,6 +2621,7 @@ class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -620,6 +620,7 @@ class BioGptForTokenClassification(BioGptPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -711,6 +712,7 @@ class BioGptForSequenceClassification(BioGptPreTrainedModel):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -442,6 +442,7 @@ class BioGptForTokenClassification(BioGptPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -533,6 +534,7 @@ class BioGptForSequenceClassification(BioGptPreTrainedModel):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -666,7 +666,11 @@ class BitModel(BitPreTrainedModel):

     @auto_docstring
     def forward(
-        self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
+        self,
+        pixel_values: Tensor,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> BaseModelOutputWithPoolingAndNoAttention:
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -721,6 +725,7 @@ class BitForImageClassification(BitPreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> ImageClassifierOutputWithNoAttention:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -767,7 +772,11 @@ class BitBackbone(BitPreTrainedModel, BackboneMixin):

     @auto_docstring
     def forward(
-        self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
+        self,
+        pixel_values: Tensor,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> BackboneOutput:
         r"""
         Examples: