transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (539)
  1. transformers/__init__.py +30 -3
  2. transformers/cli/serve.py +47 -17
  3. transformers/conversion_mapping.py +15 -2
  4. transformers/convert_slow_tokenizer.py +225 -10
  5. transformers/core_model_loading.py +196 -135
  6. transformers/data/data_collator.py +12 -4
  7. transformers/dependency_versions_table.py +1 -2
  8. transformers/dynamic_module_utils.py +1 -2
  9. transformers/feature_extraction_utils.py +1 -2
  10. transformers/file_utils.py +0 -1
  11. transformers/generation/__init__.py +11 -1
  12. transformers/generation/configuration_utils.py +3 -2
  13. transformers/generation/continuous_batching/__init__.py +4 -0
  14. transformers/generation/continuous_batching/continuous_api.py +134 -79
  15. transformers/image_processing_base.py +1 -2
  16. transformers/integrations/__init__.py +4 -2
  17. transformers/integrations/accelerate.py +15 -3
  18. transformers/integrations/aqlm.py +38 -66
  19. transformers/integrations/awq.py +48 -514
  20. transformers/integrations/bitnet.py +45 -100
  21. transformers/integrations/bitsandbytes.py +79 -191
  22. transformers/integrations/deepspeed.py +1 -0
  23. transformers/integrations/eetq.py +84 -79
  24. transformers/integrations/fbgemm_fp8.py +191 -145
  25. transformers/integrations/finegrained_fp8.py +236 -193
  26. transformers/integrations/fp_quant.py +92 -0
  27. transformers/integrations/ggml.py +11 -1
  28. transformers/integrations/higgs.py +40 -62
  29. transformers/integrations/hub_kernels.py +42 -3
  30. transformers/integrations/integration_utils.py +10 -0
  31. transformers/integrations/mxfp4.py +25 -65
  32. transformers/integrations/peft.py +7 -29
  33. transformers/integrations/quanto.py +73 -55
  34. transformers/integrations/quark.py +55 -0
  35. transformers/integrations/spqr.py +44 -90
  36. transformers/integrations/torchao.py +32 -38
  37. transformers/integrations/vptq.py +42 -59
  38. transformers/modelcard.py +1 -2
  39. transformers/modeling_gguf_pytorch_utils.py +8 -0
  40. transformers/modeling_rope_utils.py +30 -6
  41. transformers/modeling_utils.py +116 -112
  42. transformers/models/__init__.py +3 -0
  43. transformers/models/afmoe/modeling_afmoe.py +4 -4
  44. transformers/models/albert/tokenization_albert.py +6 -12
  45. transformers/models/align/modeling_align.py +2 -0
  46. transformers/models/altclip/modeling_altclip.py +4 -0
  47. transformers/models/apertus/modeling_apertus.py +4 -4
  48. transformers/models/arcee/modeling_arcee.py +4 -4
  49. transformers/models/aria/modeling_aria.py +4 -4
  50. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  51. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  52. transformers/models/auto/configuration_auto.py +11 -0
  53. transformers/models/auto/feature_extraction_auto.py +2 -0
  54. transformers/models/auto/image_processing_auto.py +1 -0
  55. transformers/models/auto/modeling_auto.py +6 -0
  56. transformers/models/auto/processing_auto.py +18 -10
  57. transformers/models/auto/tokenization_auto.py +74 -472
  58. transformers/models/autoformer/modeling_autoformer.py +4 -0
  59. transformers/models/bamba/modeling_bamba.py +4 -3
  60. transformers/models/bark/modeling_bark.py +2 -0
  61. transformers/models/bart/modeling_bart.py +7 -0
  62. transformers/models/barthez/tokenization_barthez.py +5 -10
  63. transformers/models/beit/modeling_beit.py +6 -1
  64. transformers/models/bert/tokenization_bert.py +8 -21
  65. transformers/models/big_bird/modeling_big_bird.py +6 -0
  66. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  67. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
  68. transformers/models/biogpt/modeling_biogpt.py +2 -0
  69. transformers/models/biogpt/modular_biogpt.py +2 -0
  70. transformers/models/bit/modeling_bit.py +11 -2
  71. transformers/models/bitnet/modeling_bitnet.py +4 -4
  72. transformers/models/blenderbot/modeling_blenderbot.py +5 -0
  73. transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
  74. transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
  75. transformers/models/blip/modeling_blip_text.py +2 -0
  76. transformers/models/blip_2/modeling_blip_2.py +2 -1
  77. transformers/models/bloom/modeling_bloom.py +4 -0
  78. transformers/models/blt/modeling_blt.py +2 -2
  79. transformers/models/blt/modular_blt.py +2 -2
  80. transformers/models/bridgetower/modeling_bridgetower.py +5 -1
  81. transformers/models/bros/modeling_bros.py +4 -0
  82. transformers/models/camembert/tokenization_camembert.py +8 -12
  83. transformers/models/canine/modeling_canine.py +5 -0
  84. transformers/models/chameleon/modeling_chameleon.py +2 -1
  85. transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
  86. transformers/models/clap/modeling_clap.py +5 -0
  87. transformers/models/clip/tokenization_clip.py +22 -44
  88. transformers/models/clipseg/modeling_clipseg.py +5 -0
  89. transformers/models/clvp/modeling_clvp.py +5 -0
  90. transformers/models/clvp/tokenization_clvp.py +1 -63
  91. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  92. transformers/models/codegen/tokenization_codegen.py +14 -43
  93. transformers/models/cohere/modeling_cohere.py +4 -3
  94. transformers/models/cohere/modular_cohere.py +2 -1
  95. transformers/models/cohere/tokenization_cohere.py +12 -42
  96. transformers/models/cohere2/modeling_cohere2.py +7 -6
  97. transformers/models/cohere2/modular_cohere2.py +5 -5
  98. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
  99. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  100. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  101. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  102. transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
  103. transformers/models/convbert/modeling_convbert.py +6 -0
  104. transformers/models/convnext/modeling_convnext.py +2 -4
  105. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  106. transformers/models/csm/modeling_csm.py +4 -3
  107. transformers/models/ctrl/modeling_ctrl.py +1 -0
  108. transformers/models/cvt/modeling_cvt.py +2 -0
  109. transformers/models/cwm/modeling_cwm.py +4 -4
  110. transformers/models/d_fine/modeling_d_fine.py +2 -0
  111. transformers/models/d_fine/modular_d_fine.py +1 -0
  112. transformers/models/dab_detr/modeling_dab_detr.py +4 -0
  113. transformers/models/dac/modeling_dac.py +2 -2
  114. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  115. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  116. transformers/models/dbrx/modeling_dbrx.py +2 -2
  117. transformers/models/deberta/modeling_deberta.py +5 -0
  118. transformers/models/deberta/tokenization_deberta.py +11 -20
  119. transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
  120. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  121. transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
  122. transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
  123. transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
  124. transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
  125. transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
  126. transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
  127. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  128. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  129. transformers/models/detr/modeling_detr.py +5 -0
  130. transformers/models/dia/modeling_dia.py +4 -3
  131. transformers/models/dia/modular_dia.py +0 -1
  132. transformers/models/diffllama/modeling_diffllama.py +2 -2
  133. transformers/models/dinat/modeling_dinat.py +3 -0
  134. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  135. transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
  136. transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
  137. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  138. transformers/models/doge/modeling_doge.py +2 -3
  139. transformers/models/doge/modular_doge.py +0 -1
  140. transformers/models/donut/modeling_donut_swin.py +2 -0
  141. transformers/models/dots1/modeling_dots1.py +10 -7
  142. transformers/models/dots1/modular_dots1.py +5 -3
  143. transformers/models/dpr/modeling_dpr.py +5 -0
  144. transformers/models/dpr/tokenization_dpr.py +12 -0
  145. transformers/models/edgetam/modeling_edgetam.py +1 -1
  146. transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
  147. transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
  148. transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
  149. transformers/models/efficientnet/modeling_efficientnet.py +2 -0
  150. transformers/models/emu3/modeling_emu3.py +4 -4
  151. transformers/models/eomt/image_processing_eomt.py +13 -1
  152. transformers/models/eomt/image_processing_eomt_fast.py +14 -2
  153. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  154. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  155. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
  156. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
  157. transformers/models/esm/modeling_esmfold.py +5 -4
  158. transformers/models/evolla/modeling_evolla.py +4 -4
  159. transformers/models/exaone4/modeling_exaone4.py +2 -2
  160. transformers/models/exaone4/modular_exaone4.py +0 -1
  161. transformers/models/falcon/modeling_falcon.py +6 -1
  162. transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
  163. transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
  164. transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
  165. transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
  166. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  167. transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
  168. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  169. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
  170. transformers/models/flaubert/modeling_flaubert.py +7 -0
  171. transformers/models/flava/modeling_flava.py +6 -1
  172. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
  173. transformers/models/florence2/modeling_florence2.py +2 -1
  174. transformers/models/florence2/modular_florence2.py +2 -1
  175. transformers/models/fnet/modeling_fnet.py +7 -0
  176. transformers/models/focalnet/modeling_focalnet.py +4 -0
  177. transformers/models/fsmt/modeling_fsmt.py +2 -0
  178. transformers/models/funnel/modeling_funnel.py +8 -0
  179. transformers/models/funnel/tokenization_funnel.py +17 -24
  180. transformers/models/fuyu/processing_fuyu.py +3 -3
  181. transformers/models/gemma/modeling_gemma.py +4 -4
  182. transformers/models/gemma/tokenization_gemma.py +10 -27
  183. transformers/models/gemma2/modeling_gemma2.py +4 -4
  184. transformers/models/gemma2/modular_gemma2.py +2 -1
  185. transformers/models/gemma3/modeling_gemma3.py +14 -84
  186. transformers/models/gemma3/modular_gemma3.py +12 -81
  187. transformers/models/gemma3n/modeling_gemma3n.py +18 -209
  188. transformers/models/gemma3n/modular_gemma3n.py +17 -59
  189. transformers/models/git/modeling_git.py +2 -0
  190. transformers/models/glm/modeling_glm.py +4 -4
  191. transformers/models/glm4/modeling_glm4.py +4 -4
  192. transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
  193. transformers/models/glm4v/configuration_glm4v.py +3 -1
  194. transformers/models/glm4v/modeling_glm4v.py +3 -3
  195. transformers/models/glm4v/modular_glm4v.py +6 -4
  196. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  197. transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
  198. transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
  199. transformers/models/glpn/modeling_glpn.py +2 -0
  200. transformers/models/gpt2/modeling_gpt2.py +5 -1
  201. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  202. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
  203. transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
  204. transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
  205. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  206. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  207. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
  208. transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
  209. transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
  210. transformers/models/gptj/modeling_gptj.py +3 -0
  211. transformers/models/granite/modeling_granite.py +4 -4
  212. transformers/models/granitemoe/modeling_granitemoe.py +4 -6
  213. transformers/models/granitemoe/modular_granitemoe.py +0 -2
  214. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
  215. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
  216. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
  217. transformers/models/groupvit/modeling_groupvit.py +3 -0
  218. transformers/models/helium/modeling_helium.py +4 -3
  219. transformers/models/herbert/tokenization_herbert.py +9 -25
  220. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
  221. transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
  222. transformers/models/hiera/modeling_hiera.py +4 -0
  223. transformers/models/hubert/modeling_hubert.py +3 -0
  224. transformers/models/hubert/modular_hubert.py +1 -0
  225. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
  226. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
  227. transformers/models/ibert/modeling_ibert.py +6 -0
  228. transformers/models/idefics/modeling_idefics.py +5 -21
  229. transformers/models/imagegpt/modeling_imagegpt.py +2 -1
  230. transformers/models/informer/modeling_informer.py +4 -0
  231. transformers/models/informer/modular_informer.py +1 -0
  232. transformers/models/internvl/modeling_internvl.py +2 -4
  233. transformers/models/internvl/modular_internvl.py +2 -4
  234. transformers/models/jamba/modeling_jamba.py +2 -2
  235. transformers/models/janus/modeling_janus.py +1 -0
  236. transformers/models/janus/modular_janus.py +1 -0
  237. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  238. transformers/models/kosmos2/modeling_kosmos2.py +1 -0
  239. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
  240. transformers/models/lasr/__init__.py +29 -0
  241. transformers/models/lasr/configuration_lasr.py +244 -0
  242. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  243. transformers/models/lasr/modeling_lasr.py +729 -0
  244. transformers/models/lasr/modular_lasr.py +569 -0
  245. transformers/models/lasr/processing_lasr.py +96 -0
  246. transformers/models/lasr/tokenization_lasr.py +186 -0
  247. transformers/models/layoutlm/modeling_layoutlm.py +5 -0
  248. transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
  249. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
  250. transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
  251. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  252. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  253. transformers/models/led/modeling_led.py +6 -0
  254. transformers/models/levit/modeling_levit.py +3 -0
  255. transformers/models/lfm2/modeling_lfm2.py +4 -5
  256. transformers/models/lfm2/modular_lfm2.py +0 -1
  257. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
  258. transformers/models/lightglue/modeling_lightglue.py +3 -1
  259. transformers/models/lightglue/modular_lightglue.py +1 -0
  260. transformers/models/lilt/modeling_lilt.py +4 -0
  261. transformers/models/llama/modeling_llama.py +4 -4
  262. transformers/models/llama/tokenization_llama.py +15 -43
  263. transformers/models/llama4/modeling_llama4.py +3 -2
  264. transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
  265. transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
  266. transformers/models/longformer/modeling_longformer.py +6 -0
  267. transformers/models/longt5/modeling_longt5.py +4 -0
  268. transformers/models/luke/modeling_luke.py +9 -0
  269. transformers/models/luke/tokenization_luke.py +11 -38
  270. transformers/models/lxmert/modeling_lxmert.py +2 -0
  271. transformers/models/m2m_100/modeling_m2m_100.py +4 -0
  272. transformers/models/mamba/modeling_mamba.py +14 -22
  273. transformers/models/marian/modeling_marian.py +5 -0
  274. transformers/models/markuplm/modeling_markuplm.py +4 -0
  275. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  276. transformers/models/mask2former/modeling_mask2former.py +2 -0
  277. transformers/models/maskformer/modeling_maskformer.py +2 -0
  278. transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
  279. transformers/models/mbart/modeling_mbart.py +7 -0
  280. transformers/models/mbart/tokenization_mbart.py +11 -52
  281. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  282. transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
  283. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  284. transformers/models/mimi/modeling_mimi.py +3 -1
  285. transformers/models/minimax/modeling_minimax.py +4 -4
  286. transformers/models/ministral/modeling_ministral.py +4 -4
  287. transformers/models/ministral3/configuration_ministral3.py +1 -1
  288. transformers/models/ministral3/modeling_ministral3.py +4 -3
  289. transformers/models/mistral/modeling_mistral.py +4 -3
  290. transformers/models/mixtral/modeling_mixtral.py +4 -4
  291. transformers/models/mllama/modeling_mllama.py +2 -2
  292. transformers/models/mluke/tokenization_mluke.py +6 -6
  293. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
  294. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  295. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  296. transformers/models/mobilevit/modeling_mobilevit.py +3 -0
  297. transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
  298. transformers/models/modernbert/modeling_modernbert.py +4 -1
  299. transformers/models/modernbert/modular_modernbert.py +2 -0
  300. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
  301. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
  302. transformers/models/moonshine/modeling_moonshine.py +4 -2
  303. transformers/models/moshi/modeling_moshi.py +5 -2
  304. transformers/models/mpnet/modeling_mpnet.py +5 -0
  305. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  306. transformers/models/mpt/modeling_mpt.py +2 -0
  307. transformers/models/mra/modeling_mra.py +6 -0
  308. transformers/models/mt5/modeling_mt5.py +7 -0
  309. transformers/models/musicgen/modeling_musicgen.py +2 -0
  310. transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
  311. transformers/models/mvp/modeling_mvp.py +7 -0
  312. transformers/models/nanochat/modeling_nanochat.py +4 -4
  313. transformers/models/nemotron/modeling_nemotron.py +4 -2
  314. transformers/models/nllb/tokenization_nllb.py +8 -22
  315. transformers/models/nougat/tokenization_nougat.py +11 -59
  316. transformers/models/nystromformer/modeling_nystromformer.py +6 -0
  317. transformers/models/olmo/modeling_olmo.py +4 -4
  318. transformers/models/olmo/modular_olmo.py +2 -2
  319. transformers/models/olmo2/modeling_olmo2.py +4 -5
  320. transformers/models/olmo2/modular_olmo2.py +0 -1
  321. transformers/models/olmo3/modeling_olmo3.py +4 -4
  322. transformers/models/olmoe/modeling_olmoe.py +4 -4
  323. transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
  324. transformers/models/oneformer/modeling_oneformer.py +4 -1
  325. transformers/models/openai/modeling_openai.py +3 -0
  326. transformers/models/openai/tokenization_openai.py +10 -46
  327. transformers/models/opt/modeling_opt.py +2 -0
  328. transformers/models/owlv2/modeling_owlv2.py +4 -0
  329. transformers/models/owlvit/modeling_owlvit.py +4 -0
  330. transformers/models/paddleocr_vl/__init__.py +32 -0
  331. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  332. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
  333. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  334. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
  335. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
  336. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  337. transformers/models/parakeet/configuration_parakeet.py +4 -6
  338. transformers/models/parakeet/modeling_parakeet.py +9 -6
  339. transformers/models/parakeet/modular_parakeet.py +2 -2
  340. transformers/models/parakeet/processing_parakeet.py +1 -0
  341. transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
  342. transformers/models/patchtst/modeling_patchtst.py +20 -2
  343. transformers/models/pegasus/modeling_pegasus.py +5 -0
  344. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  345. transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
  346. transformers/models/perceiver/modeling_perceiver.py +8 -0
  347. transformers/models/persimmon/modeling_persimmon.py +2 -1
  348. transformers/models/phi/modeling_phi.py +4 -5
  349. transformers/models/phi/modular_phi.py +0 -1
  350. transformers/models/phi3/modeling_phi3.py +2 -1
  351. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
  352. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
  353. transformers/models/phimoe/modeling_phimoe.py +4 -4
  354. transformers/models/phimoe/modular_phimoe.py +2 -2
  355. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  356. transformers/models/pixtral/modeling_pixtral.py +2 -1
  357. transformers/models/plbart/modeling_plbart.py +6 -0
  358. transformers/models/plbart/modular_plbart.py +2 -0
  359. transformers/models/plbart/tokenization_plbart.py +0 -2
  360. transformers/models/poolformer/modeling_poolformer.py +2 -0
  361. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  362. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  363. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  364. transformers/models/prophetnet/modeling_prophetnet.py +3 -0
  365. transformers/models/pvt/modeling_pvt.py +2 -0
  366. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  367. transformers/models/qwen2/modeling_qwen2.py +4 -4
  368. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  369. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  370. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
  371. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
  372. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  373. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
  374. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
  375. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
  376. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  377. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  378. transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
  379. transformers/models/qwen3/modeling_qwen3.py +4 -4
  380. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  381. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
  382. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
  383. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
  384. transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
  385. transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
  386. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
  387. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
  388. transformers/models/rag/modeling_rag.py +1 -0
  389. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
  390. transformers/models/reformer/modeling_reformer.py +4 -0
  391. transformers/models/reformer/tokenization_reformer.py +11 -28
  392. transformers/models/regnet/modeling_regnet.py +6 -1
  393. transformers/models/rembert/modeling_rembert.py +6 -0
  394. transformers/models/rembert/tokenization_rembert.py +3 -10
  395. transformers/models/resnet/modeling_resnet.py +11 -2
  396. transformers/models/roberta/tokenization_roberta.py +18 -27
  397. transformers/models/roformer/modeling_roformer.py +6 -0
  398. transformers/models/roformer/tokenization_roformer.py +77 -412
  399. transformers/models/rt_detr/modeling_rt_detr.py +2 -0
  400. transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
  401. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
  402. transformers/models/rwkv/modeling_rwkv.py +1 -0
  403. transformers/models/sam2/modeling_sam2.py +2 -2
  404. transformers/models/sam2/modular_sam2.py +2 -2
  405. transformers/models/sam2_video/modeling_sam2_video.py +1 -0
  406. transformers/models/sam2_video/modular_sam2_video.py +1 -0
  407. transformers/models/sam3/modeling_sam3.py +77 -80
  408. transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
  409. transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
  410. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
  411. transformers/models/sam3_video/modeling_sam3_video.py +1 -0
  412. transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
  413. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  414. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
  415. transformers/models/seed_oss/modeling_seed_oss.py +2 -2
  416. transformers/models/segformer/modeling_segformer.py +4 -1
  417. transformers/models/seggpt/modeling_seggpt.py +2 -0
  418. transformers/models/sew/modeling_sew.py +3 -0
  419. transformers/models/sew/modular_sew.py +1 -0
  420. transformers/models/sew_d/modeling_sew_d.py +3 -0
  421. transformers/models/siglip2/modeling_siglip2.py +4 -0
  422. transformers/models/siglip2/modular_siglip2.py +4 -0
  423. transformers/models/smollm3/modeling_smollm3.py +4 -4
  424. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  425. transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
  426. transformers/models/speecht5/modeling_speecht5.py +13 -1
  427. transformers/models/splinter/modeling_splinter.py +3 -0
  428. transformers/models/splinter/tokenization_splinter.py +9 -28
  429. transformers/models/squeezebert/modeling_squeezebert.py +6 -0
  430. transformers/models/stablelm/modeling_stablelm.py +3 -1
  431. transformers/models/starcoder2/modeling_starcoder2.py +4 -3
  432. transformers/models/superglue/modeling_superglue.py +1 -0
  433. transformers/models/superpoint/modeling_superpoint.py +1 -0
  434. transformers/models/swiftformer/modeling_swiftformer.py +2 -0
  435. transformers/models/swin/modeling_swin.py +4 -0
  436. transformers/models/swin2sr/modeling_swin2sr.py +2 -0
  437. transformers/models/swinv2/modeling_swinv2.py +4 -0
  438. transformers/models/t5/modeling_t5.py +7 -0
  439. transformers/models/t5/tokenization_t5.py +4 -8
  440. transformers/models/t5gemma/modeling_t5gemma.py +5 -5
  441. transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
  442. transformers/models/table_transformer/modeling_table_transformer.py +4 -0
  443. transformers/models/tapas/modeling_tapas.py +3 -0
  444. transformers/models/textnet/modeling_textnet.py +11 -2
  445. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  446. transformers/models/timesfm/modeling_timesfm.py +2 -0
  447. transformers/models/timesfm/modular_timesfm.py +2 -0
  448. transformers/models/timesformer/modeling_timesformer.py +2 -0
  449. transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
  450. transformers/models/trocr/modeling_trocr.py +2 -0
  451. transformers/models/tvp/modeling_tvp.py +2 -0
  452. transformers/models/udop/modeling_udop.py +4 -0
  453. transformers/models/udop/tokenization_udop.py +5 -13
  454. transformers/models/umt5/modeling_umt5.py +7 -0
  455. transformers/models/unispeech/modeling_unispeech.py +4 -0
  456. transformers/models/unispeech/modular_unispeech.py +2 -0
  457. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  458. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  459. transformers/models/univnet/modeling_univnet.py +1 -0
  460. transformers/models/upernet/modeling_upernet.py +1 -0
  461. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  462. transformers/models/vilt/modeling_vilt.py +6 -0
  463. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  464. transformers/models/visual_bert/modeling_visual_bert.py +6 -0
  465. transformers/models/vitdet/modeling_vitdet.py +2 -0
  466. transformers/models/vitmatte/modeling_vitmatte.py +1 -0
  467. transformers/models/vits/modeling_vits.py +1 -0
  468. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  469. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  470. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
  471. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
  472. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
  473. transformers/models/wavlm/modeling_wavlm.py +5 -0
  474. transformers/models/whisper/modeling_whisper.py +6 -0
  475. transformers/models/whisper/tokenization_whisper.py +4 -15
  476. transformers/models/x_clip/modeling_x_clip.py +3 -0
  477. transformers/models/xglm/modeling_xglm.py +1 -0
  478. transformers/models/xglm/tokenization_xglm.py +4 -9
  479. transformers/models/xlm/modeling_xlm.py +5 -0
  480. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  481. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  482. transformers/models/yoso/modeling_yoso.py +6 -0
  483. transformers/models/zamba/modeling_zamba.py +2 -0
  484. transformers/models/zamba2/modeling_zamba2.py +4 -2
  485. transformers/models/zamba2/modular_zamba2.py +1 -1
  486. transformers/models/zoedepth/modeling_zoedepth.py +1 -0
  487. transformers/pipelines/__init__.py +2 -3
  488. transformers/pipelines/base.py +1 -9
  489. transformers/pipelines/document_question_answering.py +3 -1
  490. transformers/pipelines/text_generation.py +1 -1
  491. transformers/processing_utils.py +23 -11
  492. transformers/quantizers/base.py +35 -110
  493. transformers/quantizers/quantizer_aqlm.py +1 -5
  494. transformers/quantizers/quantizer_auto_round.py +1 -2
  495. transformers/quantizers/quantizer_awq.py +17 -81
  496. transformers/quantizers/quantizer_bitnet.py +3 -8
  497. transformers/quantizers/quantizer_bnb_4bit.py +13 -110
  498. transformers/quantizers/quantizer_bnb_8bit.py +16 -92
  499. transformers/quantizers/quantizer_compressed_tensors.py +1 -5
  500. transformers/quantizers/quantizer_eetq.py +14 -62
  501. transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
  502. transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
  503. transformers/quantizers/quantizer_fp_quant.py +48 -78
  504. transformers/quantizers/quantizer_gptq.py +7 -24
  505. transformers/quantizers/quantizer_higgs.py +40 -54
  506. transformers/quantizers/quantizer_hqq.py +144 -153
  507. transformers/quantizers/quantizer_mxfp4.py +13 -167
  508. transformers/quantizers/quantizer_quanto.py +20 -64
  509. transformers/quantizers/quantizer_quark.py +36 -17
  510. transformers/quantizers/quantizer_spqr.py +1 -4
  511. transformers/quantizers/quantizer_torchao.py +23 -202
  512. transformers/quantizers/quantizer_vptq.py +8 -22
  513. transformers/quantizers/quantizers_utils.py +20 -0
  514. transformers/testing_utils.py +297 -36
  515. transformers/tokenization_mistral_common.py +4 -0
  516. transformers/tokenization_utils_base.py +113 -222
  517. transformers/tokenization_utils_tokenizers.py +168 -107
  518. transformers/trainer.py +28 -31
  519. transformers/trainer_jit_checkpoint.py +126 -0
  520. transformers/trainer_utils.py +1 -1
  521. transformers/training_args.py +66 -28
  522. transformers/utils/__init__.py +3 -4
  523. transformers/utils/auto_docstring.py +1 -0
  524. transformers/utils/generic.py +27 -1
  525. transformers/utils/hub.py +5 -15
  526. transformers/utils/import_utils.py +61 -16
  527. transformers/utils/kernel_config.py +4 -2
  528. transformers/utils/loading_report.py +19 -10
  529. transformers/utils/quantization_config.py +75 -242
  530. transformers/video_processing_utils.py +1 -2
  531. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
  532. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
  533. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
  534. transformers/kernels/__init__.py +0 -0
  535. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  536. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  537. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
  538. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
  539. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
transformers/models/bitnet/modeling_bitnet.py

@@ -27,7 +27,7 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
@@ -36,7 +36,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_bitnet import BitNetConfig


@@ -151,6 +151,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class BitNetAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -176,7 +177,6 @@ class BitNetAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         self.attn_sub_norm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

     def forward(
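The two BitNetAttention hunks above swap the per-instance `self.rotary_fn = apply_rotary_pos_emb` assignment for a class-level `@use_kernelized_func(...)` decorator. A minimal sketch of the pattern, assuming the decorator does little more than attach the reference function as a class attribute so a hub kernel can later replace it (the real helper lives in `transformers.integrations` and may differ):

def use_kernelized_func(func):
    def decorator(cls):
        # Assumed behavior: expose the reference implementation at class level
        # (the attribute name `rotary_fn` matches the removed __init__ line),
        # leaving one hook where a compiled hub kernel can be swapped in.
        cls.rotary_fn = staticmethod(func)
        return cls
    return decorator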
@@ -326,7 +326,7 @@ class BitNetRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
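The `torch.autocast` → `maybe_autocast` change here recurs in every rotary embedding touched by this release (see the BLT and Chameleon hunks below). The helper is imported from `transformers/utils/generic.py`, whose body is not part of this diff, so the following is only a hedged sketch of a plausible wrapper that degrades to a no-op when autocast is unsupported:

from contextlib import nullcontext

import torch

def maybe_autocast(device_type=None, enabled=True, **kwargs):
    # Assumed behavior, not the actual implementation: delegate to
    # torch.autocast when the device type supports it, otherwise fall
    # back to a do-nothing context manager instead of raising.
    try:
        return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
    except RuntimeError:
        return nullcontext()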
transformers/models/blenderbot/modeling_blenderbot.py

@@ -493,6 +493,7 @@ class BlenderbotEncoder(BlenderbotPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -643,6 +644,7 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel):
         output_hidden_states=None,
         return_dict=None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -885,6 +887,7 @@ class BlenderbotModel(BlenderbotPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1039,6 +1042,7 @@ class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel, GenerationMi
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1196,6 +1200,7 @@ class BlenderbotForCausalLM(BlenderbotPreTrainedModel, GenerationMixin):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
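Most of the per-model churn in this release is this `+        **kwargs,` line added to `forward` signatures, as in the five Blenderbot hunks above and many files below. The diff itself does not state the motivation; a reasonable reading, given the `TransformersKwargs`/`Unpack` imports in these files, is that forwards now tolerate and thread through extra keyword arguments instead of raising. An illustrative toy, not the library's code:

import torch
from torch import nn

class TinyEncoder(nn.Module):
    def forward(self, input_ids, attention_mask=None, return_dict=None, **kwargs):
        # Unknown flags land in `kwargs` instead of raising TypeError, so a
        # shared pipeline can pass new options to every model uniformly.
        return input_ids.float()

# An argument the signature never named no longer breaks the call:
TinyEncoder()(torch.zeros(1, 4, dtype=torch.long), output_attentions=False)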
transformers/models/blenderbot/tokenization_blenderbot.py

@@ -102,14 +102,15 @@ class BlenderbotTokenizer(TokenizersBackend):
         add_prefix_space (`bool`, *optional*, defaults to `True`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
             other word. (Blenderbot tokenizer detect beginning of words by the preceding space).
-        vocab (`dict`, *optional*):
-            Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
-        merges (`list`, *optional*):
-            Custom merges list. If not provided, merges are loaded from merges_file.
+        vocab (`str` or `dict[str, int]`, *optional*):
+            Custom vocabulary dictionary. If not provided, vocabulary is loaded from `vocab_file`.
+        merges (`str` or `list[str]`, *optional*):
+            Custom merges list. If not provided, merges are loaded from `merges_file`.
     """

     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    model = BPE

     def __init__(
         self,
@@ -132,22 +133,20 @@ class BlenderbotTokenizer(TokenizersBackend):
             else mask_token
         )

-        if vocab is not None and merges is not None:
-            self._vocab = (
-                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
-            )
-            self._merges = merges
-        else:
-            # Initialize with minimal vocab
-            self._vocab = {
+        # Initialize vocab and merges; when not provided fall back to minimal vocab
+        self._vocab = (
+            vocab
+            if vocab is not None
+            else {
                 str(bos_token): 0,
                 str(pad_token): 1,
                 str(eos_token): 2,
                 str(unk_token): 3,
                 str(mask_token): 4,
             }
-        self._merges = []
+        )

+        self._merges = merges or []
         self._tokenizer = Tokenizer(
             BPE(
                 vocab=self._vocab,
@@ -168,10 +167,7 @@ class BlenderbotTokenizer(TokenizersBackend):
             trim_offsets=True,
         )

-        tokenizer_object = self._tokenizer
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             bos_token=bos_token,
             eos_token=eos_token,
             sep_token=sep_token,
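For reference, the fallback path above hands a five-token vocabulary and an empty merge list to a live `tokenizers` BPE model. A self-contained equivalent using the `tokenizers` library directly (token strings assumed to be the Blenderbot defaults):

from tokenizers import Tokenizer
from tokenizers.models import BPE

vocab = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3, "<mask>": 4}
merges = []  # mirrors `self._merges = merges or []`
tok = Tokenizer(BPE(vocab=vocab, merges=merges, unk_token="<unk>"))
print(tok.get_vocab_size())  # 5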
transformers/models/blenderbot_small/modeling_blenderbot_small.py

@@ -484,6 +484,7 @@ class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -630,6 +631,7 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
         output_hidden_states=None,
         return_dict=None,
         cache_position=None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -858,6 +860,7 @@ class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -999,6 +1002,7 @@ class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel, Ge
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1156,6 +1160,7 @@ class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel, GenerationMixin
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
     ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
transformers/models/blip/modeling_blip_text.py

@@ -609,6 +609,7 @@ class BlipTextModel(BlipTextPreTrainedModel):
         return_dict: Optional[bool] = None,
         is_decoder: Optional[bool] = False,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
         r"""
         encoder_hidden_states (`torch.FloatTensor`, *optional*):
@@ -771,6 +772,7 @@ class BlipTextLMHeadModel(BlipTextPreTrainedModel, GenerationMixin):
         reduction: Optional[str] = "mean",
         cache_position: Optional[torch.Tensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
         r"""
         encoder_hidden_states (`torch.FloatTensor`, *optional*): Sequence of
transformers/models/blip_2/modeling_blip_2.py

@@ -603,7 +603,7 @@ class Blip2QFormerMultiHeadAttention(nn.Module):

         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
-        attention_probs_dropped = self.dropout(attention_probs)
+        attention_probs_dropped = self.dropout(attention_probs).to(value_layer.dtype)

         context_layer = torch.matmul(attention_probs_dropped, value_layer)

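The added `.to(value_layer.dtype)` guards the matmul that follows: attention probabilities can end up in float32 (e.g. after an upcast softmax) while `value_layer` stays in half precision, and mixed-dtype matmuls fail. A standalone reproduction of the idea, with illustrative shapes not taken from the diff:

import torch

probs = torch.softmax(torch.randn(2, 4, 4), dim=-1)  # float32 probabilities
value = torch.randn(2, 4, 8, dtype=torch.float16)
# torch.matmul(probs, value) would raise a dtype-mismatch RuntimeError;
# casting first, as the new line does, keeps the product in half precision.
context = torch.matmul(probs.to(value.dtype), value)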
@@ -1948,6 +1948,7 @@ class Blip2ForImageTextRetrieval(Blip2PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, Blip2ImageTextMatchingModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
transformers/models/bloom/modeling_bloom.py

@@ -465,6 +465,7 @@ class BloomModel(BloomPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
@@ -883,6 +884,7 @@ class BloomForSequenceClassification(BloomPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], SequenceClassifierOutputWithPast]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
@@ -1006,6 +1008,7 @@ class BloomForTokenClassification(BloomPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
@@ -1084,6 +1087,7 @@ class BloomForQuestionAnswering(BloomPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
transformers/models/blt/modeling_blt.py

@@ -38,7 +38,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import OutputRecorder, check_model_inputs
+from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from .configuration_blt import (
     BltConfig,
     BltGlobalTransformerConfig,
@@ -141,7 +141,7 @@ class BltRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.repeat_interleave(freqs, 2, dim=-1)  # diff from Llama: we interleave() instead of cat()
             cos = emb.cos() * self.attention_scaling
transformers/models/blt/modular_blt.py

@@ -29,7 +29,7 @@ from ...modeling_rope_utils import dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, logging
-from ...utils.generic import OutputRecorder, check_model_inputs
+from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from ..cohere2.modeling_cohere2 import rotate_half  # noqa: F401
 from ..llama.modeling_llama import LlamaRotaryEmbedding
 from ..mllama.modeling_mllama import (
@@ -277,7 +277,7 @@ class BltRotaryEmbedding(LlamaRotaryEmbedding):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.repeat_interleave(freqs, 2, dim=-1)  # diff from Llama: we interleave() instead of cat()
             cos = emb.cos() * self.attention_scaling
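Both BLT files keep the inline comment about interleaving rather than concatenating the frequency bank. The difference in one line each:

import torch

freqs = torch.tensor([[1.0, 2.0]])
torch.cat((freqs, freqs), dim=-1)          # tensor([[1., 2., 1., 2.]])  (Llama)
torch.repeat_interleave(freqs, 2, dim=-1)  # tensor([[1., 1., 2., 2.]])  (BLT)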
transformers/models/bridgetower/modeling_bridgetower.py

@@ -960,7 +960,7 @@ class BridgeTowerVisionModel(BridgeTowerPreTrainedModel):
     def dtype(self):
         return self.visual.embeddings.patch_embedding.weight.dtype

-    def forward(self, image, image_mask=None, interpolate_pos_encoding=False):
+    def forward(self, image, image_mask=None, interpolate_pos_encoding=False, **kwargs):
         return self.visual(image.type(self.dtype), image_mask, interpolate_pos_encoding)


@@ -1223,6 +1223,7 @@ class BridgeTowerModel(BridgeTowerPreTrainedModel):
         return_dict: Optional[bool] = None,
         labels: Optional[torch.LongTensor] = None,
         interpolate_pos_encoding: bool = False,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BridgeTowerModelOutput]:
         r"""
         image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
@@ -1530,6 +1531,7 @@ class BridgeTowerForMaskedLM(BridgeTowerPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[MaskedLMOutput, tuple[torch.FloatTensor]]:
         r"""
         image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
@@ -1630,6 +1632,7 @@ class BridgeTowerForImageAndTextRetrieval(BridgeTowerPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> Union[SequenceClassifierOutput, tuple[torch.FloatTensor]]:
         r"""
         image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
@@ -1742,6 +1745,7 @@ class BridgeTowerForContrastiveLearning(BridgeTowerPreTrainedModel):
         output_hidden_states: Optional[bool] = True,
         return_dict: Optional[bool] = None,
         return_loss: Optional[bool] = None,
+        **kwargs,
     ) -> Union[BridgeTowerContrastiveOutput, tuple[torch.FloatTensor]]:
         r"""
         image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
transformers/models/bros/modeling_bros.py

@@ -563,6 +563,7 @@ class BrosModel(BrosPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
         r"""
         bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'):
@@ -701,6 +702,7 @@ class BrosForTokenClassification(BrosPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
         r"""
         bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'):
@@ -821,6 +823,7 @@ class BrosSpadeEEForTokenClassification(BrosPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BrosSpadeOutput]:
         r"""
         bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'):
@@ -957,6 +960,7 @@ class BrosSpadeELForTokenClassification(BrosPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
         r"""
         bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'):
transformers/models/camembert/tokenization_camembert.py

@@ -14,6 +14,8 @@
 # limitations under the License
 """Tokenization classes for Camembert model."""

+from typing import Optional, Union
+
 from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
 from tokenizers.models import Unigram

@@ -83,7 +85,7 @@ class CamembertTokenizer(TokenizersBackend):
         vocab_file (`str`, *optional*):
             [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        vocab (`dict`, *optional*):
+        vocab (`str`, `dict` or `list`, *optional*):
             Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
     """

@@ -103,7 +105,7 @@ class CamembertTokenizer(TokenizersBackend):
         additional_special_tokens=None,
         add_prefix_space=True,
         vocab_file=None,
-        vocab=None,
+        vocab: Optional[Union[str, dict, list]] = None,
         **kwargs,
     ):
         self.vocab_file = vocab_file
@@ -114,9 +116,9 @@ class CamembertTokenizer(TokenizersBackend):
         if additional_special_tokens is None:
             additional_special_tokens = ["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"]

-        if vocab is not None and isinstance(vocab, list):
-            self._vocab = list(vocab)
-            unk_index = next(i for i, (tok, _) in enumerate(self._vocab) if tok == str(unk_token))
+        if vocab is not None:
+            self._vocab = vocab
+            unk_index = next((i for i, (tok, _) in enumerate(self._vocab) if tok == str(unk_token)), 0)
             self._tokenizer = Tokenizer(Unigram(self._vocab, unk_id=unk_index, byte_fallback=False))
         else:
             self._vocab = [
@@ -131,11 +133,8 @@ class CamembertTokenizer(TokenizersBackend):

         self._tokenizer.normalizer = normalizers.Sequence(
             [
-                normalizers.Replace("\n", " "),
-                normalizers.Replace("\r", " "),
-                normalizers.Replace("\t", " "),
+                normalizers.Replace(Regex(r"\s{2,}|[\n\r\t]"), " "),
                 normalizers.Strip(left=False, right=True),
-                normalizers.Replace(Regex(" {2,}"), "▁"),
             ]
         )

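The three literal `Replace` normalizers plus the trailing `Regex(" {2,}")` step collapse into one regex pass followed by a right strip. A quick check with the `tokenizers` API (note the old chain mapped runs of spaces to `▁` where the new one maps them to a plain space, so this reads as a behavior change, not just a cleanup):

from tokenizers import Regex, normalizers

norm = normalizers.Sequence(
    [
        normalizers.Replace(Regex(r"\s{2,}|[\n\r\t]"), " "),
        normalizers.Strip(left=False, right=True),
    ]
)
print(repr(norm.normalize_str("Hello\tworld  !\n")))  # 'Hello world !'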
@@ -143,10 +142,7 @@ class CamembertTokenizer(TokenizersBackend):
         self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)
         self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)

-        tokenizer_object = self._tokenizer
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             bos_token=bos_token,
             eos_token=eos_token,
             sep_token=sep_token,
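The relaxed branch above (`if vocab is not None:` with a defaulted `unk_index` of 0) feeds straight into a Unigram model, which expects `(token, score)` pairs. A minimal standalone construction under that assumption, with made-up tokens and scores:

from tokenizers import Tokenizer
from tokenizers.models import Unigram

vocab = [("<unk>", 0.0), ("▁Hello", -1.5), ("▁world", -2.0)]
unk_index = next((i for i, (tok, _) in enumerate(vocab) if tok == "<unk>"), 0)
tok = Tokenizer(Unigram(vocab, unk_id=unk_index, byte_fallback=False))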
transformers/models/canine/modeling_canine.py

@@ -836,6 +836,7 @@ class CanineModel(CaninePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, CanineModelOutputWithPooling]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1006,6 +1007,7 @@ class CanineForSequenceClassification(CaninePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1089,6 +1091,7 @@ class CanineForMultipleChoice(CaninePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MultipleChoiceModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -1192,6 +1195,7 @@ class CanineForTokenClassification(CaninePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1287,6 +1291,7 @@ class CanineForQuestionAnswering(CaninePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

transformers/models/chameleon/modeling_chameleon.py

@@ -38,6 +38,7 @@ from ...utils import (
     can_return_tuple,
     logging,
 )
+from ...utils.generic import maybe_autocast
 from .configuration_chameleon import ChameleonConfig, ChameleonVQVAEConfig


@@ -122,7 +123,7 @@ class ChameleonRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
transformers/models/chinese_clip/modeling_chinese_clip.py

@@ -839,6 +839,7 @@ class ChineseCLIPTextModel(ChineseCLIPPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPooling]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -926,6 +927,7 @@ class ChineseCLIPVisionModel(ChineseCLIPPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         Examples:
@@ -1091,6 +1093,7 @@ class ChineseCLIPModel(ChineseCLIPPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ChineseCLIPOutput]:
         r"""
         return_loss (`bool`, *optional*):
transformers/models/clap/modeling_clap.py

@@ -1356,6 +1356,7 @@ class ClapAudioModel(ClapPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
@@ -1446,6 +1447,7 @@ class ClapTextModel(ClapPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1627,6 +1629,7 @@ class ClapModel(ClapPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ClapOutput]:
         r"""
         is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
@@ -1740,6 +1743,7 @@ class ClapTextModelWithProjection(ClapPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ClapTextModelOutput]:
         r"""
         Examples:
@@ -1803,6 +1807,7 @@ class ClapAudioModelWithProjection(ClapPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ClapAudioModelOutput]:
         r"""
         is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):