transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (835)
  1. transformers/__init__.py +49 -3
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/cli/serve.py +47 -17
  6. transformers/configuration_utils.py +114 -70
  7. transformers/conversion_mapping.py +83 -7
  8. transformers/convert_slow_tokenizer.py +225 -10
  9. transformers/core_model_loading.py +374 -147
  10. transformers/data/data_collator.py +12 -4
  11. transformers/dependency_versions_table.py +2 -3
  12. transformers/dynamic_module_utils.py +1 -2
  13. transformers/feature_extraction_utils.py +55 -24
  14. transformers/file_utils.py +0 -1
  15. transformers/generation/__init__.py +11 -1
  16. transformers/generation/candidate_generator.py +79 -31
  17. transformers/generation/configuration_utils.py +165 -124
  18. transformers/generation/continuous_batching/__init__.py +4 -0
  19. transformers/generation/continuous_batching/cache.py +47 -18
  20. transformers/generation/continuous_batching/cache_manager.py +131 -34
  21. transformers/generation/continuous_batching/continuous_api.py +228 -136
  22. transformers/generation/continuous_batching/requests.py +28 -1
  23. transformers/generation/continuous_batching/scheduler.py +11 -4
  24. transformers/generation/stopping_criteria.py +1 -1
  25. transformers/generation/utils.py +108 -110
  26. transformers/generation/watermarking.py +8 -5
  27. transformers/image_processing_base.py +3 -14
  28. transformers/image_processing_utils_fast.py +15 -4
  29. transformers/initialization.py +37 -0
  30. transformers/integrations/__init__.py +16 -2
  31. transformers/integrations/accelerate.py +58 -113
  32. transformers/integrations/aqlm.py +36 -66
  33. transformers/integrations/awq.py +46 -515
  34. transformers/integrations/bitnet.py +47 -105
  35. transformers/integrations/bitsandbytes.py +91 -202
  36. transformers/integrations/deepspeed.py +18 -2
  37. transformers/integrations/eetq.py +84 -81
  38. transformers/integrations/fbgemm_fp8.py +191 -145
  39. transformers/integrations/finegrained_fp8.py +241 -208
  40. transformers/integrations/flash_attention.py +2 -2
  41. transformers/integrations/fp_quant.py +92 -0
  42. transformers/integrations/ggml.py +11 -1
  43. transformers/integrations/higgs.py +37 -62
  44. transformers/integrations/hub_kernels.py +65 -8
  45. transformers/integrations/integration_utils.py +45 -0
  46. transformers/integrations/mistral.py +12 -0
  47. transformers/integrations/moe.py +240 -0
  48. transformers/integrations/mxfp4.py +28 -74
  49. transformers/integrations/peft.py +12 -29
  50. transformers/integrations/quanto.py +77 -56
  51. transformers/integrations/quark.py +55 -0
  52. transformers/integrations/spqr.py +42 -90
  53. transformers/integrations/tensor_parallel.py +167 -221
  54. transformers/integrations/torchao.py +32 -38
  55. transformers/integrations/vptq.py +40 -59
  56. transformers/modelcard.py +1 -2
  57. transformers/modeling_gguf_pytorch_utils.py +74 -19
  58. transformers/modeling_rope_utils.py +107 -86
  59. transformers/modeling_utils.py +611 -527
  60. transformers/models/__init__.py +22 -0
  61. transformers/models/afmoe/modeling_afmoe.py +10 -19
  62. transformers/models/afmoe/modular_afmoe.py +5 -13
  63. transformers/models/aimv2/modeling_aimv2.py +4 -0
  64. transformers/models/aimv2/modular_aimv2.py +4 -0
  65. transformers/models/albert/modeling_albert.py +3 -0
  66. transformers/models/albert/tokenization_albert.py +6 -12
  67. transformers/models/align/modeling_align.py +14 -6
  68. transformers/models/altclip/modeling_altclip.py +11 -3
  69. transformers/models/apertus/modeling_apertus.py +8 -6
  70. transformers/models/apertus/modular_apertus.py +4 -1
  71. transformers/models/arcee/modeling_arcee.py +5 -5
  72. transformers/models/aria/modeling_aria.py +12 -8
  73. transformers/models/aria/modular_aria.py +7 -3
  74. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  75. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  76. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  77. transformers/models/auto/auto_factory.py +1 -1
  78. transformers/models/auto/configuration_auto.py +38 -0
  79. transformers/models/auto/feature_extraction_auto.py +9 -3
  80. transformers/models/auto/image_processing_auto.py +5 -2
  81. transformers/models/auto/modeling_auto.py +37 -0
  82. transformers/models/auto/processing_auto.py +22 -10
  83. transformers/models/auto/tokenization_auto.py +147 -566
  84. transformers/models/auto/video_processing_auto.py +5 -2
  85. transformers/models/autoformer/modeling_autoformer.py +4 -0
  86. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  87. transformers/models/bamba/modeling_bamba.py +21 -21
  88. transformers/models/bamba/modular_bamba.py +17 -16
  89. transformers/models/bark/modeling_bark.py +11 -0
  90. transformers/models/bart/configuration_bart.py +0 -1
  91. transformers/models/bart/modeling_bart.py +14 -0
  92. transformers/models/barthez/tokenization_barthez.py +5 -10
  93. transformers/models/beit/image_processing_beit_fast.py +0 -1
  94. transformers/models/beit/modeling_beit.py +6 -1
  95. transformers/models/bert/modeling_bert.py +3 -0
  96. transformers/models/bert/tokenization_bert.py +8 -21
  97. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  98. transformers/models/big_bird/modeling_big_bird.py +9 -0
  99. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  100. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
  101. transformers/models/biogpt/modeling_biogpt.py +2 -0
  102. transformers/models/biogpt/modular_biogpt.py +2 -0
  103. transformers/models/bit/modeling_bit.py +16 -3
  104. transformers/models/bitnet/modeling_bitnet.py +5 -5
  105. transformers/models/blenderbot/modeling_blenderbot.py +12 -0
  106. transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
  107. transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
  108. transformers/models/blip/modeling_blip.py +2 -0
  109. transformers/models/blip/modeling_blip_text.py +10 -0
  110. transformers/models/blip_2/modeling_blip_2.py +4 -1
  111. transformers/models/bloom/modeling_bloom.py +17 -44
  112. transformers/models/blt/modeling_blt.py +164 -4
  113. transformers/models/blt/modular_blt.py +170 -5
  114. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  115. transformers/models/bridgetower/modeling_bridgetower.py +11 -1
  116. transformers/models/bros/modeling_bros.py +12 -0
  117. transformers/models/camembert/modeling_camembert.py +109 -106
  118. transformers/models/camembert/tokenization_camembert.py +8 -12
  119. transformers/models/canine/modeling_canine.py +11 -0
  120. transformers/models/canine/tokenization_canine.py +2 -0
  121. transformers/models/chameleon/modeling_chameleon.py +11 -5
  122. transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
  123. transformers/models/clap/feature_extraction_clap.py +2 -2
  124. transformers/models/clap/modeling_clap.py +30 -15
  125. transformers/models/clip/modeling_clip.py +2 -0
  126. transformers/models/clip/tokenization_clip.py +22 -44
  127. transformers/models/clipseg/modeling_clipseg.py +9 -0
  128. transformers/models/clvp/modeling_clvp.py +19 -3
  129. transformers/models/clvp/tokenization_clvp.py +1 -63
  130. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  131. transformers/models/codegen/modeling_codegen.py +13 -4
  132. transformers/models/codegen/tokenization_codegen.py +14 -43
  133. transformers/models/cohere/modeling_cohere.py +5 -4
  134. transformers/models/cohere/modular_cohere.py +2 -1
  135. transformers/models/cohere/tokenization_cohere.py +12 -42
  136. transformers/models/cohere2/modeling_cohere2.py +8 -7
  137. transformers/models/cohere2/modular_cohere2.py +5 -5
  138. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
  139. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  140. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  141. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  142. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  143. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  144. transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
  145. transformers/models/convbert/modeling_convbert.py +9 -0
  146. transformers/models/convnext/image_processing_convnext.py +2 -2
  147. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  148. transformers/models/convnext/modeling_convnext.py +2 -4
  149. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  150. transformers/models/csm/generation_csm.py +19 -22
  151. transformers/models/csm/modeling_csm.py +7 -4
  152. transformers/models/csm/modular_csm.py +2 -0
  153. transformers/models/ctrl/modeling_ctrl.py +15 -2
  154. transformers/models/cvt/modeling_cvt.py +7 -1
  155. transformers/models/cwm/modeling_cwm.py +5 -5
  156. transformers/models/d_fine/configuration_d_fine.py +3 -4
  157. transformers/models/d_fine/modeling_d_fine.py +48 -39
  158. transformers/models/d_fine/modular_d_fine.py +16 -4
  159. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  160. transformers/models/dab_detr/modeling_dab_detr.py +5 -1
  161. transformers/models/dac/modeling_dac.py +6 -6
  162. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  163. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  164. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  165. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  166. transformers/models/dbrx/configuration_dbrx.py +9 -1
  167. transformers/models/dbrx/modeling_dbrx.py +3 -3
  168. transformers/models/deberta/modeling_deberta.py +7 -0
  169. transformers/models/deberta/tokenization_deberta.py +11 -20
  170. transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
  171. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  172. transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
  173. transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
  174. transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
  175. transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
  176. transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
  177. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  178. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  179. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  180. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  181. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  182. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  183. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  184. transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
  185. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  186. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  187. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  188. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  189. transformers/models/detr/configuration_detr.py +1 -1
  190. transformers/models/detr/modeling_detr.py +13 -1
  191. transformers/models/dia/generation_dia.py +3 -10
  192. transformers/models/dia/modeling_dia.py +16 -4
  193. transformers/models/dia/modular_dia.py +11 -1
  194. transformers/models/dia/processing_dia.py +1 -1
  195. transformers/models/diffllama/modeling_diffllama.py +5 -5
  196. transformers/models/diffllama/modular_diffllama.py +2 -2
  197. transformers/models/dinat/modeling_dinat.py +3 -0
  198. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  199. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  200. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
  201. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
  202. transformers/models/distilbert/modeling_distilbert.py +11 -9
  203. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  204. transformers/models/doge/modeling_doge.py +3 -4
  205. transformers/models/doge/modular_doge.py +0 -1
  206. transformers/models/donut/image_processing_donut_fast.py +0 -1
  207. transformers/models/donut/modeling_donut_swin.py +18 -12
  208. transformers/models/dots1/modeling_dots1.py +23 -11
  209. transformers/models/dots1/modular_dots1.py +5 -3
  210. transformers/models/dpr/modeling_dpr.py +5 -0
  211. transformers/models/dpr/tokenization_dpr.py +12 -0
  212. transformers/models/dpt/configuration_dpt.py +1 -1
  213. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  214. transformers/models/dpt/modular_dpt.py +1 -2
  215. transformers/models/edgetam/configuration_edgetam.py +1 -1
  216. transformers/models/edgetam/modeling_edgetam.py +6 -3
  217. transformers/models/edgetam/modular_edgetam.py +15 -14
  218. transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
  219. transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
  220. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  221. transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
  222. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  223. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  224. transformers/models/efficientnet/modeling_efficientnet.py +7 -1
  225. transformers/models/electra/modeling_electra.py +7 -0
  226. transformers/models/emu3/modeling_emu3.py +12 -6
  227. transformers/models/emu3/modular_emu3.py +7 -1
  228. transformers/models/encodec/modeling_encodec.py +14 -0
  229. transformers/models/eomt/image_processing_eomt.py +13 -1
  230. transformers/models/eomt/image_processing_eomt_fast.py +60 -16
  231. transformers/models/eomt/modeling_eomt.py +7 -0
  232. transformers/models/eomt/modular_eomt.py +7 -0
  233. transformers/models/ernie/modeling_ernie.py +6 -0
  234. transformers/models/ernie/modular_ernie.py +6 -0
  235. transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
  236. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  237. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
  238. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
  239. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  240. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  241. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  242. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  243. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  244. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  245. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  246. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  247. transformers/models/esm/modeling_esm.py +6 -0
  248. transformers/models/esm/modeling_esmfold.py +11 -5
  249. transformers/models/evolla/modeling_evolla.py +13 -5
  250. transformers/models/evolla/modular_evolla.py +8 -0
  251. transformers/models/exaone4/modeling_exaone4.py +3 -3
  252. transformers/models/exaone4/modular_exaone4.py +0 -1
  253. transformers/models/falcon/modeling_falcon.py +9 -4
  254. transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
  255. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  256. transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
  257. transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
  258. transformers/models/fast_vlm/__init__.py +27 -0
  259. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  260. transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
  261. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  262. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
  263. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  264. transformers/models/flaubert/modeling_flaubert.py +21 -15
  265. transformers/models/flava/image_processing_flava_fast.py +0 -2
  266. transformers/models/flava/modeling_flava.py +10 -2
  267. transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
  268. transformers/models/florence2/modeling_florence2.py +22 -4
  269. transformers/models/florence2/modular_florence2.py +15 -1
  270. transformers/models/fnet/modeling_fnet.py +14 -0
  271. transformers/models/focalnet/modeling_focalnet.py +4 -0
  272. transformers/models/fsmt/modeling_fsmt.py +2 -0
  273. transformers/models/funnel/modeling_funnel.py +8 -0
  274. transformers/models/funnel/tokenization_funnel.py +17 -24
  275. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  276. transformers/models/fuyu/modeling_fuyu.py +3 -1
  277. transformers/models/fuyu/processing_fuyu.py +19 -3
  278. transformers/models/gemma/modeling_gemma.py +14 -16
  279. transformers/models/gemma/modular_gemma.py +9 -11
  280. transformers/models/gemma/tokenization_gemma.py +10 -27
  281. transformers/models/gemma2/modeling_gemma2.py +5 -5
  282. transformers/models/gemma2/modular_gemma2.py +3 -2
  283. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  284. transformers/models/gemma3/modeling_gemma3.py +42 -91
  285. transformers/models/gemma3/modular_gemma3.py +38 -87
  286. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  287. transformers/models/gemma3n/modeling_gemma3n.py +65 -218
  288. transformers/models/gemma3n/modular_gemma3n.py +68 -68
  289. transformers/models/git/modeling_git.py +183 -126
  290. transformers/models/glm/modeling_glm.py +5 -5
  291. transformers/models/glm4/modeling_glm4.py +5 -5
  292. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  293. transformers/models/glm46v/modeling_glm46v.py +3 -1
  294. transformers/models/glm46v/modular_glm46v.py +3 -0
  295. transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
  296. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  297. transformers/models/glm4v/configuration_glm4v.py +3 -1
  298. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  299. transformers/models/glm4v/modeling_glm4v.py +18 -8
  300. transformers/models/glm4v/modular_glm4v.py +17 -7
  301. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  302. transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
  303. transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
  304. transformers/models/glmasr/__init__.py +30 -0
  305. transformers/models/glmasr/configuration_glmasr.py +197 -0
  306. transformers/models/glmasr/modeling_glmasr.py +512 -0
  307. transformers/models/glmasr/modular_glmasr.py +433 -0
  308. transformers/models/glmasr/processing_glmasr.py +332 -0
  309. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  310. transformers/models/glpn/modeling_glpn.py +2 -0
  311. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  312. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  313. transformers/models/gpt2/modeling_gpt2.py +13 -6
  314. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  315. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
  316. transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
  317. transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
  318. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  319. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  320. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
  321. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  322. transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
  323. transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
  324. transformers/models/gptj/modeling_gptj.py +18 -6
  325. transformers/models/granite/modeling_granite.py +5 -5
  326. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  327. transformers/models/granitemoe/modeling_granitemoe.py +6 -9
  328. transformers/models/granitemoe/modular_granitemoe.py +1 -4
  329. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  330. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
  331. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  332. transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
  333. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  334. transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
  335. transformers/models/groupvit/modeling_groupvit.py +9 -1
  336. transformers/models/helium/modeling_helium.py +5 -4
  337. transformers/models/herbert/tokenization_herbert.py +9 -25
  338. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
  339. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
  340. transformers/models/hiera/modeling_hiera.py +4 -0
  341. transformers/models/hubert/modeling_hubert.py +7 -0
  342. transformers/models/hubert/modular_hubert.py +5 -0
  343. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
  344. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  345. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  346. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
  347. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  348. transformers/models/ibert/modeling_ibert.py +22 -0
  349. transformers/models/idefics/modeling_idefics.py +15 -21
  350. transformers/models/idefics2/modeling_idefics2.py +7 -1
  351. transformers/models/idefics3/modeling_idefics3.py +5 -1
  352. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  353. transformers/models/imagegpt/modeling_imagegpt.py +11 -3
  354. transformers/models/informer/modeling_informer.py +4 -0
  355. transformers/models/informer/modular_informer.py +1 -0
  356. transformers/models/instructblip/modeling_instructblip.py +2 -0
  357. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  358. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  359. transformers/models/internvl/modeling_internvl.py +13 -12
  360. transformers/models/internvl/modular_internvl.py +7 -13
  361. transformers/models/internvl/video_processing_internvl.py +0 -1
  362. transformers/models/jais2/__init__.py +27 -0
  363. transformers/models/jais2/configuration_jais2.py +152 -0
  364. transformers/models/jais2/modeling_jais2.py +486 -0
  365. transformers/models/jais2/modular_jais2.py +196 -0
  366. transformers/models/jamba/modeling_jamba.py +25 -20
  367. transformers/models/jamba/modular_jamba.py +17 -17
  368. transformers/models/janus/image_processing_janus_fast.py +0 -1
  369. transformers/models/janus/modeling_janus.py +16 -7
  370. transformers/models/janus/modular_janus.py +17 -7
  371. transformers/models/jetmoe/modeling_jetmoe.py +4 -4
  372. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  373. transformers/models/kosmos2/modeling_kosmos2.py +15 -2
  374. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  375. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  376. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
  377. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  378. transformers/models/lasr/__init__.py +29 -0
  379. transformers/models/lasr/configuration_lasr.py +248 -0
  380. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  381. transformers/models/lasr/modeling_lasr.py +730 -0
  382. transformers/models/lasr/modular_lasr.py +576 -0
  383. transformers/models/lasr/processing_lasr.py +94 -0
  384. transformers/models/lasr/tokenization_lasr.py +186 -0
  385. transformers/models/layoutlm/modeling_layoutlm.py +10 -3
  386. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  387. transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
  388. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
  389. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  390. transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
  391. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  392. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  393. transformers/models/led/modeling_led.py +12 -0
  394. transformers/models/levit/modeling_levit.py +21 -0
  395. transformers/models/lfm2/modeling_lfm2.py +5 -6
  396. transformers/models/lfm2/modular_lfm2.py +0 -1
  397. transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
  398. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  399. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  400. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  401. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  402. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  403. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  404. transformers/models/lightglue/modeling_lightglue.py +3 -1
  405. transformers/models/lightglue/modular_lightglue.py +1 -0
  406. transformers/models/lilt/modeling_lilt.py +23 -15
  407. transformers/models/llama/modeling_llama.py +5 -5
  408. transformers/models/llama/tokenization_llama.py +15 -43
  409. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  410. transformers/models/llama4/modeling_llama4.py +11 -6
  411. transformers/models/llava/image_processing_llava_fast.py +0 -1
  412. transformers/models/llava/modeling_llava.py +12 -7
  413. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  414. transformers/models/llava_next/modeling_llava_next.py +7 -3
  415. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  416. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  417. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  418. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  419. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  420. transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
  421. transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
  422. transformers/models/longformer/modeling_longformer.py +6 -0
  423. transformers/models/longt5/modeling_longt5.py +4 -4
  424. transformers/models/luke/modeling_luke.py +9 -0
  425. transformers/models/luke/tokenization_luke.py +11 -38
  426. transformers/models/lxmert/modeling_lxmert.py +2 -0
  427. transformers/models/m2m_100/modeling_m2m_100.py +14 -0
  428. transformers/models/mamba/modeling_mamba.py +16 -23
  429. transformers/models/mamba2/modeling_mamba2.py +24 -23
  430. transformers/models/marian/configuration_marian.py +1 -1
  431. transformers/models/marian/modeling_marian.py +8 -0
  432. transformers/models/markuplm/modeling_markuplm.py +9 -8
  433. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  434. transformers/models/mask2former/configuration_mask2former.py +3 -3
  435. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  436. transformers/models/mask2former/modeling_mask2former.py +11 -0
  437. transformers/models/maskformer/configuration_maskformer.py +3 -3
  438. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  439. transformers/models/maskformer/modeling_maskformer.py +11 -1
  440. transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
  441. transformers/models/mbart/configuration_mbart.py +1 -0
  442. transformers/models/mbart/modeling_mbart.py +14 -0
  443. transformers/models/mbart/tokenization_mbart.py +11 -52
  444. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  445. transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
  446. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  447. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  448. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  449. transformers/models/mimi/modeling_mimi.py +28 -5
  450. transformers/models/minimax/modeling_minimax.py +19 -6
  451. transformers/models/minimax/modular_minimax.py +12 -1
  452. transformers/models/ministral/modeling_ministral.py +5 -5
  453. transformers/models/ministral3/configuration_ministral3.py +1 -1
  454. transformers/models/ministral3/modeling_ministral3.py +5 -4
  455. transformers/models/mistral/modeling_mistral.py +5 -4
  456. transformers/models/mistral3/modeling_mistral3.py +10 -4
  457. transformers/models/mistral3/modular_mistral3.py +3 -1
  458. transformers/models/mixtral/modeling_mixtral.py +15 -7
  459. transformers/models/mixtral/modular_mixtral.py +6 -2
  460. transformers/models/mlcd/modeling_mlcd.py +6 -0
  461. transformers/models/mlcd/modular_mlcd.py +4 -0
  462. transformers/models/mllama/modeling_mllama.py +15 -4
  463. transformers/models/mluke/tokenization_mluke.py +6 -6
  464. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  465. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
  466. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  467. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  468. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  469. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  470. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  471. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  472. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  473. transformers/models/mobilevit/modeling_mobilevit.py +7 -0
  474. transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
  475. transformers/models/modernbert/modeling_modernbert.py +16 -2
  476. transformers/models/modernbert/modular_modernbert.py +14 -1
  477. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
  478. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
  479. transformers/models/moonshine/modeling_moonshine.py +5 -3
  480. transformers/models/moshi/modeling_moshi.py +26 -53
  481. transformers/models/mpnet/modeling_mpnet.py +7 -0
  482. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  483. transformers/models/mpt/modeling_mpt.py +2 -0
  484. transformers/models/mra/modeling_mra.py +10 -1
  485. transformers/models/mt5/configuration_mt5.py +2 -3
  486. transformers/models/mt5/modeling_mt5.py +7 -10
  487. transformers/models/musicgen/modeling_musicgen.py +7 -9
  488. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
  489. transformers/models/mvp/modeling_mvp.py +14 -0
  490. transformers/models/nanochat/modeling_nanochat.py +5 -5
  491. transformers/models/nemotron/modeling_nemotron.py +7 -5
  492. transformers/models/nllb/tokenization_nllb.py +8 -22
  493. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  494. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  495. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  496. transformers/models/nougat/tokenization_nougat.py +15 -68
  497. transformers/models/nystromformer/modeling_nystromformer.py +13 -0
  498. transformers/models/olmo/modeling_olmo.py +5 -5
  499. transformers/models/olmo/modular_olmo.py +2 -2
  500. transformers/models/olmo2/modeling_olmo2.py +5 -6
  501. transformers/models/olmo2/modular_olmo2.py +0 -1
  502. transformers/models/olmo3/modeling_olmo3.py +5 -5
  503. transformers/models/olmoe/modeling_olmoe.py +15 -7
  504. transformers/models/olmoe/modular_olmoe.py +4 -2
  505. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  506. transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
  507. transformers/models/oneformer/configuration_oneformer.py +3 -3
  508. transformers/models/oneformer/modeling_oneformer.py +11 -39
  509. transformers/models/openai/modeling_openai.py +15 -0
  510. transformers/models/openai/tokenization_openai.py +10 -46
  511. transformers/models/opt/modeling_opt.py +2 -0
  512. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  513. transformers/models/ovis2/modeling_ovis2.py +15 -3
  514. transformers/models/ovis2/modular_ovis2.py +8 -0
  515. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  516. transformers/models/owlv2/modeling_owlv2.py +11 -3
  517. transformers/models/owlv2/modular_owlv2.py +0 -2
  518. transformers/models/owlvit/modeling_owlvit.py +11 -3
  519. transformers/models/paddleocr_vl/__init__.py +32 -0
  520. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  521. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
  522. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  523. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
  524. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
  525. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  526. transformers/models/paligemma/modeling_paligemma.py +25 -17
  527. transformers/models/parakeet/configuration_parakeet.py +4 -6
  528. transformers/models/parakeet/modeling_parakeet.py +14 -6
  529. transformers/models/parakeet/modular_parakeet.py +7 -2
  530. transformers/models/parakeet/processing_parakeet.py +1 -0
  531. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  532. transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
  533. transformers/models/patchtst/modeling_patchtst.py +25 -6
  534. transformers/models/pe_audio/__init__.py +30 -0
  535. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  536. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  537. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  538. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  539. transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
  540. transformers/models/pe_audio_video/__init__.py +29 -0
  541. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  542. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  543. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  544. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  545. transformers/models/pe_video/__init__.py +30 -0
  546. transformers/models/pe_video/configuration_pe_video.py +211 -0
  547. transformers/models/pe_video/modeling_pe_video.py +636 -0
  548. transformers/models/pe_video/modular_pe_video.py +219 -0
  549. transformers/models/pe_video/processing_pe_video.py +10 -0
  550. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  551. transformers/models/pegasus/configuration_pegasus.py +1 -0
  552. transformers/models/pegasus/modeling_pegasus.py +8 -0
  553. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  554. transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
  555. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  556. transformers/models/perceiver/modeling_perceiver.py +13 -1
  557. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  558. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  559. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  560. transformers/models/persimmon/modeling_persimmon.py +3 -2
  561. transformers/models/phi/modeling_phi.py +5 -6
  562. transformers/models/phi/modular_phi.py +0 -1
  563. transformers/models/phi3/modeling_phi3.py +3 -2
  564. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
  565. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
  566. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  567. transformers/models/phimoe/modeling_phimoe.py +15 -7
  568. transformers/models/phimoe/modular_phimoe.py +3 -3
  569. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  570. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  571. transformers/models/pixio/__init__.py +30 -0
  572. transformers/models/pixio/configuration_pixio.py +151 -0
  573. transformers/models/pixio/modeling_pixio.py +507 -0
  574. transformers/models/pixio/modular_pixio.py +404 -0
  575. transformers/models/pixtral/modeling_pixtral.py +3 -2
  576. transformers/models/pixtral/processing_pixtral.py +3 -1
  577. transformers/models/plbart/configuration_plbart.py +1 -0
  578. transformers/models/plbart/modeling_plbart.py +13 -0
  579. transformers/models/plbart/modular_plbart.py +8 -0
  580. transformers/models/plbart/tokenization_plbart.py +0 -2
  581. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  582. transformers/models/poolformer/modeling_poolformer.py +13 -1
  583. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  584. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  585. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  586. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  587. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  588. transformers/models/prophetnet/modeling_prophetnet.py +5 -1
  589. transformers/models/pvt/modeling_pvt.py +2 -0
  590. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  591. transformers/models/qwen2/modeling_qwen2.py +5 -5
  592. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  593. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  594. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
  595. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
  596. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  597. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
  598. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
  599. transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
  600. transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
  601. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  602. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  603. transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
  604. transformers/models/qwen3/modeling_qwen3.py +5 -5
  605. transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
  606. transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
  607. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  608. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
  609. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
  610. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  611. transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
  612. transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
  613. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  614. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
  615. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
  616. transformers/models/rag/configuration_rag.py +0 -8
  617. transformers/models/rag/modeling_rag.py +8 -9
  618. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
  619. transformers/models/reformer/modeling_reformer.py +13 -1
  620. transformers/models/reformer/tokenization_reformer.py +11 -28
  621. transformers/models/regnet/modeling_regnet.py +10 -1
  622. transformers/models/rembert/modeling_rembert.py +13 -1
  623. transformers/models/rembert/tokenization_rembert.py +3 -10
  624. transformers/models/resnet/modeling_resnet.py +19 -5
  625. transformers/models/roberta/modeling_roberta.py +3 -0
  626. transformers/models/roberta/modular_roberta.py +3 -0
  627. transformers/models/roberta/tokenization_roberta.py +18 -27
  628. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  629. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  630. transformers/models/roformer/modeling_roformer.py +6 -0
  631. transformers/models/roformer/tokenization_roformer.py +77 -412
  632. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  633. transformers/models/rt_detr/modeling_rt_detr.py +6 -0
  634. transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
  635. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  636. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
  637. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  638. transformers/models/rwkv/modeling_rwkv.py +2 -1
  639. transformers/models/sam/configuration_sam.py +1 -0
  640. transformers/models/sam/image_processing_sam_fast.py +0 -1
  641. transformers/models/sam/modeling_sam.py +4 -1
  642. transformers/models/sam2/configuration_sam2.py +1 -1
  643. transformers/models/sam2/modeling_sam2.py +7 -3
  644. transformers/models/sam2/modular_sam2.py +7 -3
  645. transformers/models/sam2_video/modeling_sam2_video.py +52 -43
  646. transformers/models/sam2_video/modular_sam2_video.py +32 -18
  647. transformers/models/sam3/configuration_sam3.py +21 -1
  648. transformers/models/sam3/modeling_sam3.py +100 -80
  649. transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
  650. transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
  651. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  652. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
  653. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  654. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  655. transformers/models/sam3_video/modeling_sam3_video.py +4 -3
  656. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  657. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  658. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  659. transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
  660. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  661. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
  662. transformers/models/seed_oss/modeling_seed_oss.py +3 -3
  663. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  664. transformers/models/segformer/modeling_segformer.py +6 -3
  665. transformers/models/segformer/modular_segformer.py +0 -1
  666. transformers/models/seggpt/modeling_seggpt.py +2 -0
  667. transformers/models/sew/modeling_sew.py +3 -0
  668. transformers/models/sew/modular_sew.py +1 -0
  669. transformers/models/sew_d/modeling_sew_d.py +3 -0
  670. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  671. transformers/models/siglip/modeling_siglip.py +24 -2
  672. transformers/models/siglip2/modeling_siglip2.py +67 -41
  673. transformers/models/siglip2/modular_siglip2.py +4 -0
  674. transformers/models/smollm3/modeling_smollm3.py +5 -5
  675. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  676. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  677. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  678. transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
  679. transformers/models/speecht5/modeling_speecht5.py +41 -1
  680. transformers/models/splinter/modeling_splinter.py +12 -3
  681. transformers/models/splinter/tokenization_splinter.py +9 -28
  682. transformers/models/squeezebert/modeling_squeezebert.py +8 -0
  683. transformers/models/stablelm/modeling_stablelm.py +4 -2
  684. transformers/models/starcoder2/modeling_starcoder2.py +5 -4
  685. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  686. transformers/models/superglue/modeling_superglue.py +1 -0
  687. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  688. transformers/models/superpoint/modeling_superpoint.py +1 -0
  689. transformers/models/swiftformer/modeling_swiftformer.py +6 -0
  690. transformers/models/swin/modeling_swin.py +20 -12
  691. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  692. transformers/models/swin2sr/modeling_swin2sr.py +51 -33
  693. transformers/models/swinv2/modeling_swinv2.py +45 -33
  694. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  695. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  696. transformers/models/t5/configuration_t5.py +7 -1
  697. transformers/models/t5/modeling_t5.py +8 -7
  698. transformers/models/t5/tokenization_t5.py +4 -8
  699. transformers/models/t5gemma/modeling_t5gemma.py +6 -6
  700. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  701. transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
  702. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  703. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  704. transformers/models/table_transformer/modeling_table_transformer.py +5 -1
  705. transformers/models/tapas/modeling_tapas.py +3 -0
  706. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  707. transformers/models/textnet/modeling_textnet.py +11 -2
  708. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  709. transformers/models/timesfm/modeling_timesfm.py +14 -0
  710. transformers/models/timesfm/modular_timesfm.py +14 -0
  711. transformers/models/timesformer/modeling_timesformer.py +2 -0
  712. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  713. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  714. transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
  715. transformers/models/trocr/modeling_trocr.py +3 -2
  716. transformers/models/tvp/configuration_tvp.py +5 -1
  717. transformers/models/tvp/modeling_tvp.py +6 -4
  718. transformers/models/udop/configuration_udop.py +1 -0
  719. transformers/models/udop/modeling_udop.py +7 -7
  720. transformers/models/udop/tokenization_udop.py +5 -13
  721. transformers/models/umt5/configuration_umt5.py +2 -2
  722. transformers/models/umt5/modeling_umt5.py +7 -6
  723. transformers/models/unispeech/modeling_unispeech.py +4 -0
  724. transformers/models/unispeech/modular_unispeech.py +2 -0
  725. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  726. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  727. transformers/models/univnet/modeling_univnet.py +1 -0
  728. transformers/models/upernet/modeling_upernet.py +1 -0
  729. transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
  730. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  731. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  732. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  733. transformers/models/video_llava/modeling_video_llava.py +7 -3
  734. transformers/models/vilt/configuration_vilt.py +2 -2
  735. transformers/models/vilt/modeling_vilt.py +13 -0
  736. transformers/models/vipllava/modeling_vipllava.py +7 -3
  737. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  738. transformers/models/visual_bert/modeling_visual_bert.py +8 -0
  739. transformers/models/vitdet/modeling_vitdet.py +2 -0
  740. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  741. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  742. transformers/models/vitmatte/modeling_vitmatte.py +5 -0
  743. transformers/models/vitpose/configuration_vitpose.py +1 -1
  744. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  745. transformers/models/vits/modeling_vits.py +1 -0
  746. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  747. transformers/models/voxtral/modeling_voxtral.py +2 -2
  748. transformers/models/voxtral/modular_voxtral.py +2 -2
  749. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  750. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
  751. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
  752. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
  753. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  754. transformers/models/wavlm/modeling_wavlm.py +5 -0
  755. transformers/models/whisper/generation_whisper.py +1 -0
  756. transformers/models/whisper/modeling_whisper.py +11 -3
  757. transformers/models/whisper/tokenization_whisper.py +4 -15
  758. transformers/models/x_clip/modeling_x_clip.py +5 -0
  759. transformers/models/xcodec/modeling_xcodec.py +5 -0
  760. transformers/models/xglm/modeling_xglm.py +11 -0
  761. transformers/models/xglm/tokenization_xglm.py +4 -9
  762. transformers/models/xlm/modeling_xlm.py +18 -14
  763. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  764. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  765. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  766. transformers/models/xlnet/modeling_xlnet.py +3 -1
  767. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  768. transformers/models/xmod/modeling_xmod.py +3 -0
  769. transformers/models/yoso/modeling_yoso.py +10 -1
  770. transformers/models/zamba/modeling_zamba.py +4 -1
  771. transformers/models/zamba2/modeling_zamba2.py +7 -4
  772. transformers/models/zamba2/modular_zamba2.py +1 -1
  773. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  774. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  775. transformers/models/zoedepth/modeling_zoedepth.py +8 -0
  776. transformers/pipelines/__init__.py +11 -9
  777. transformers/pipelines/automatic_speech_recognition.py +20 -12
  778. transformers/pipelines/base.py +2 -10
  779. transformers/pipelines/document_question_answering.py +4 -2
  780. transformers/pipelines/question_answering.py +1 -1
  781. transformers/pipelines/text_generation.py +1 -1
  782. transformers/pipelines/text_to_audio.py +2 -2
  783. transformers/processing_utils.py +133 -50
  784. transformers/quantizers/auto.py +2 -4
  785. transformers/quantizers/base.py +44 -174
  786. transformers/quantizers/quantizer_aqlm.py +2 -23
  787. transformers/quantizers/quantizer_auto_round.py +2 -12
  788. transformers/quantizers/quantizer_awq.py +20 -89
  789. transformers/quantizers/quantizer_bitnet.py +4 -14
  790. transformers/quantizers/quantizer_bnb_4bit.py +18 -155
  791. transformers/quantizers/quantizer_bnb_8bit.py +24 -110
  792. transformers/quantizers/quantizer_compressed_tensors.py +2 -9
  793. transformers/quantizers/quantizer_eetq.py +16 -74
  794. transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
  795. transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
  796. transformers/quantizers/quantizer_fp_quant.py +52 -82
  797. transformers/quantizers/quantizer_gptq.py +8 -28
  798. transformers/quantizers/quantizer_higgs.py +42 -60
  799. transformers/quantizers/quantizer_hqq.py +144 -153
  800. transformers/quantizers/quantizer_mxfp4.py +14 -194
  801. transformers/quantizers/quantizer_quanto.py +35 -79
  802. transformers/quantizers/quantizer_quark.py +36 -17
  803. transformers/quantizers/quantizer_spqr.py +4 -12
  804. transformers/quantizers/quantizer_torchao.py +50 -325
  805. transformers/quantizers/quantizer_vptq.py +4 -27
  806. transformers/quantizers/quantizers_utils.py +20 -0
  807. transformers/testing_utils.py +324 -47
  808. transformers/tokenization_mistral_common.py +7 -2
  809. transformers/tokenization_utils_base.py +116 -224
  810. transformers/tokenization_utils_tokenizers.py +190 -106
  811. transformers/trainer.py +51 -32
  812. transformers/trainer_callback.py +8 -0
  813. transformers/trainer_jit_checkpoint.py +126 -0
  814. transformers/trainer_seq2seq.py +4 -0
  815. transformers/trainer_utils.py +1 -1
  816. transformers/training_args.py +74 -38
  817. transformers/utils/__init__.py +7 -4
  818. transformers/utils/attention_visualizer.py +4 -4
  819. transformers/utils/auto_docstring.py +35 -25
  820. transformers/utils/generic.py +47 -1
  821. transformers/utils/hub.py +5 -15
  822. transformers/utils/import_utils.py +112 -25
  823. transformers/utils/kernel_config.py +74 -19
  824. transformers/utils/loading_report.py +19 -10
  825. transformers/utils/quantization_config.py +78 -245
  826. transformers/video_processing_utils.py +17 -14
  827. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
  828. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
  829. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
  830. transformers/kernels/__init__.py +0 -0
  831. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  832. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  833. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  834. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
  835. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -67,6 +67,10 @@ class CodeGenTokenizer(TokenizersBackend):
     refer to this superclass for more information regarding those methods.
 
     Args:
+        vocab (`str` or `dict[str, int]`, *optional*):
+            Custom vocabulary dictionary. If not provided, vocabulary is loaded from `vocab_file`.
+        merges (`str` or `list[str]`, *optional*):
+            Custom merges list. If not provided, merges are loaded from `merges_file`.
         unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
@@ -79,31 +83,24 @@ class CodeGenTokenizer(TokenizersBackend):
         add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
             other word. (CodeGen tokenizer detect beginning of words by the preceding space).
-        add_bos_token (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial beginning of sentence token to the input.
         return_token_type_ids (`bool`, *optional*, defaults to `False`):
             Whether to return token type IDs.
-        vocab (`dict`, *optional*):
-            Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
-        merges (`list`, *optional*):
-            Custom merges list. If not provided, merges are loaded from merges_file.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = None
+    model = BPE
 
     def __init__(
         self,
-        unk_token="<|endoftext|>",
-        bos_token="<|endoftext|>",
-        eos_token="<|endoftext|>",
+        vocab: Optional[Union[str, dict[str, int]]] = None,
+        merges: Optional[Union[str, list[str]]] = None,
+        unk_token: str = "<|endoftext|>",
+        bos_token: str = "<|endoftext|>",
+        eos_token: str = "<|endoftext|>",
         pad_token=None,
-        add_prefix_space=False,
-        add_bos_token=False,
-        return_token_type_ids=False,
-        vocab: Optional[dict] = None,
-        merges: Optional[list] = None,
+        add_prefix_space: bool = False,
+        return_token_type_ids: bool = False,
         **kwargs,
     ):
         self.return_token_type_ids = return_token_type_ids
@@ -112,17 +109,8 @@ class CodeGenTokenizer(TokenizersBackend):
 
         self.add_prefix_space = add_prefix_space
 
-        if vocab is not None:
-            self._vocab = (
-                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
-            )
-        else:
-            self._vocab = {}
-
-        if merges is not None:
-            self._merges = merges
-        else:
-            self._merges = []
+        self._vocab = vocab if vocab is not None else {}
+        self._merges = merges or []
 
         self._tokenizer = Tokenizer(
             BPE(
@@ -141,33 +129,16 @@ class CodeGenTokenizer(TokenizersBackend):
             add_prefix_space=True, use_regex=True, trim_offsets=False
         )
 
-        tokenizer_object = self._tokenizer
-
-        # Set these before calling super().__init__() so the base class _post_init() can use them
-        self._add_bos_token = add_bos_token
-        self._add_eos_token = False
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
-            add_bos_token=add_bos_token,
            return_token_type_ids=return_token_type_ids,
            **kwargs,
        )
 
-        self._post_init()
-
-    def _post_init(self):
-        self._tokenizer.post_processor = processors.ByteLevel(
-            add_prefix_space=True, use_regex=True, trim_offsets=False
-        )
-        # Ensure base class post-init runs to register special/extra tokens, etc.
-        super()._post_init()
-
     def decode(
         self,
         token_ids: Union[int, list[int], np.ndarray, "torch.Tensor"],
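Taken together, these hunks change the public construction path: `vocab` and `merges` now lead the signature, `add_bos_token` and the `_post_init` override are gone, and the raw `vocab`/`merges` are handed straight to `tokenizers.BPE`. A hedged usage sketch of the new signature; the toy vocabulary and merges below are invented for illustration, and the exact runtime behavior may differ:

```python
from transformers import CodeGenTokenizer

# Invented toy BPE data: every merge result must itself be in the vocab.
vocab = {"<|endoftext|>": 0, "h": 1, "e": 2, "l": 3, "o": 4, "he": 5, "ll": 6, "llo": 7, "hello": 8}
merges = ["h e", "l l", "ll o", "he llo"]

# vocab/merges now come first; add_bos_token is no longer accepted.
tok = CodeGenTokenizer(vocab=vocab, merges=merges)
print(tok.tokenize("hello"))  # expected: ["hello"] once the four merges apply
```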
@@ -36,6 +36,7 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
+from ...integrations import use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
@@ -44,7 +45,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_cohere import CohereConfig
 
 
@@ -82,7 +83,7 @@ class CohereRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
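Replacing the plain `original_inv_freq` attribute with a non-persistent buffer means the pristine frequencies now follow `.to()` device/dtype moves with the module while staying out of checkpoints, and the `.clone()` decouples them from later in-place updates to `inv_freq`. A minimal sketch of that distinction, using only standard `torch.nn` behavior:

```python
import torch
from torch import nn

class Demo(nn.Module):
    def __init__(self):
        super().__init__()
        freq = torch.ones(4)
        self.plain = freq  # plain attribute: .to() / .cuda() ignore it
        self.register_buffer("buf", freq.clone(), persistent=False)  # moves with module, not saved

demo = Demo()
print("buf" in demo.state_dict())  # False: non-persistent buffers are excluded from checkpoints
demo = demo.to(torch.float64)
print(demo.buf.dtype, demo.plain.dtype)  # torch.float64 torch.float32
```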
@@ -121,7 +122,7 @@ class CohereRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.repeat_interleave(freqs, 2, dim=-1)  # diff from Llama: we interleave() instead of cat()
             cos = emb.cos() * self.attention_scaling
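The diff only shows the call sites and the import of `maybe_autocast` from `utils.generic`; its body is not part of this diff. A plausible reading is a thin wrapper that degrades to a no-op where `torch.autocast` is unsupported for a device. The helper below is a hypothetical sketch of that idea, not the library's actual implementation:

```python
import contextlib
import torch

def maybe_autocast_sketch(device_type: str, enabled: bool = True, **kwargs):
    """Hypothetical: fall back to a null context when autocast is unavailable."""
    try:
        return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
    except RuntimeError:  # e.g. backends without autocast support
        return contextlib.nullcontext()

# Same call shape as in the diff: disable autocast to force float32 math.
with maybe_autocast_sketch(device_type="cpu", enabled=False):
    x = torch.ones(2, 2) @ torch.ones(2, 2)
```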
@@ -222,6 +223,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype)
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class CohereAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -247,7 +249,6 @@ class CohereAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         self.use_qk_norm = config.use_qk_norm
         if self.use_qk_norm:
             # When sharding the model using Tensor Parallelism, need to be careful to use n_local_heads
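The per-instance `rotary_fn` attribute is replaced by the class decorator `use_kernelized_func(apply_rotary_pos_emb)` from `transformers.integrations` (the import appears in the hunk above), centralizing the choice between the eager rotary implementation and a hub kernel. Its internals are not shown in this diff; the following is a hypothetical sketch of the decorator pattern only, with invented names:

```python
# Hypothetical sketch: attach a default callable as a class attribute so a
# kernel registry (or subclass) can swap it in one place instead of per instance.
def use_kernelized_func_sketch(func):
    def wrap(cls):
        cls.rotary_fn = staticmethod(func)  # default: eager implementation
        return cls
    return wrap

def eager_rope(q, k):  # stand-in for apply_rotary_pos_emb
    return q, k

@use_kernelized_func_sketch(eager_rope)
class Attention:
    def forward(self, q, k):
        return self.rotary_fn(q, k)  # may have been replaced by a fused kernel
```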
@@ -36,6 +36,7 @@ from ...modeling_rope_utils import dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, logging
+from ...utils.generic import maybe_autocast
 from ..llama.modeling_llama import (
     LlamaAttention,
     LlamaForCausalLM,
@@ -75,7 +76,7 @@ class CohereRotaryEmbedding(LlamaRotaryEmbedding):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.repeat_interleave(freqs, 2, dim=-1)  # diff from Llama: we interleave() instead of cat()
             cos = emb.cos() * self.attention_scaling
@@ -100,21 +100,23 @@ class CohereTokenizer(TokenizersBackend):
             Whether or not the default system prompt for Cohere tokenizer should be used.
         add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not the tokenizer should automatically add a prefix space
-        vocab (`dict`, *optional*):
+        vocab (`str`, `dict` or `list`, *optional*):
             Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
-        merges (`list`, *optional*):
-            Custom merges list. If not provided, merges are loaded from merges_file.
+        merges (`str` or `list[str]`, *optional*):
+            Custom merges list. If not provided, merges are loaded from `merges_file`.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     padding_side = "left"
     model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = None
+    model = BPE
     # No `max_model_input_sizes`
 
     def __init__(
         self,
+        vocab: Optional[Union[str, dict[str, int]]] = None,
+        merges: Optional[Union[str, list[str]]] = None,
         errors: str = "replace",
         unk_token: str = "<UNK>",
         bos_token: str = "<BOS_TOKEN>",
@@ -123,27 +125,19 @@ class CohereTokenizer(TokenizersBackend):
         cls_token: str = "<CLS>",
         sep_token: str = "<SEP>",
         mask_token: str = "<MASK_TOKEN>",
-        add_bos_token: bool = True,
-        add_eos_token: bool = False,
         use_default_system_prompt: bool = False,
         add_prefix_space: bool = False,
-        vocab: Optional[dict] = None,
-        merges: Optional[list] = None,
         **kwargs,
     ):
-        self._add_bos_token = add_bos_token
-        self._add_eos_token = add_eos_token
         self.use_default_system_prompt = use_default_system_prompt
         self.add_prefix_space = add_prefix_space
         self.grounded_generation_template = kwargs.pop("grounded_generation_template", None)
         self.tool_use_template = kwargs.pop("tool_use_template", None)
 
-        if vocab is not None:
-            self._vocab = (
-                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
-            )
-        else:
-            self._vocab = {
+        self._vocab = (
+            vocab
+            if vocab is not None
+            else {
                 str(pad_token): 0,
                 str(unk_token): 1,
                 str(cls_token): 2,
@@ -151,12 +145,9 @@ class CohereTokenizer(TokenizersBackend):
                 str(mask_token): 4,
                 str(bos_token): 5,
             }
+        )
 
-        if merges is not None:
-            self._merges = merges
-        else:
-            self._merges = []
-
+        self._merges = merges or []
         self._tokenizer = Tokenizer(
             BPE(
                 vocab=self._vocab,
@@ -177,10 +168,7 @@ class CohereTokenizer(TokenizersBackend):
         )
         self._tokenizer.decoder = decoders.ByteLevel(add_prefix_space=add_prefix_space, trim_offsets=True)
 
-        tokenizer_object = self._tokenizer
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             errors=errors,
             unk_token=unk_token,
             bos_token=bos_token,
@@ -189,8 +177,6 @@ class CohereTokenizer(TokenizersBackend):
             cls_token=cls_token,
             sep_token=sep_token,
             mask_token=mask_token,
-            add_bos_token=add_bos_token,
-            add_eos_token=add_eos_token,
             use_default_system_prompt=use_default_system_prompt,
             add_prefix_space=add_prefix_space,
             **kwargs,
@@ -198,22 +184,6 @@ class CohereTokenizer(TokenizersBackend):
 
         self._post_init()
 
-    def _post_init(self):
-        """Post-initialization to ensure add_prefix_space is applied correctly."""
-        # Re-apply add_prefix_space setting to pre_tokenizer and decoder
-        # This is needed because when loading from pretrained, the tokenizer.json
-        # has these settings baked in and we need to override them
-        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
-            [
-                pre_tokenizers.Digits(individual_digits=True),
-                pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space, trim_offsets=True),
-            ]
-        )
-        self._tokenizer.decoder = decoders.ByteLevel(add_prefix_space=self.add_prefix_space, trim_offsets=True)
-
-        # Call parent to handle AddedToken properties
-        super()._post_init()
-
     def apply_tool_use_template(
         self,
         conversation: list[dict[str, str]],
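When no `vocab` is passed, the constructor now builds the special-token mapping inline rather than through the removed list-of-pairs branch. A small sketch of the resulting default ids; the `pad_token` default string is not visible in these hunks, so `"<PAD>"` is an assumption, while the other token strings come from the signature above:

```python
# Ids 0-5 mirror the default mapping built in CohereTokenizer.__init__ when no
# custom vocab is supplied. "<PAD>" is assumed; the rest are taken from the diff.
special_tokens = ["<PAD>", "<UNK>", "<CLS>", "<SEP>", "<MASK_TOKEN>", "<BOS_TOKEN>"]
default_vocab = {tok: idx for idx, tok in enumerate(special_tokens)}
print(default_vocab["<BOS_TOKEN>"])  # 5
```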
@@ -28,15 +28,15 @@ import torch.nn as nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
+from ...integrations import use_kernelized_func
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
-from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_cohere2 import Cohere2Config
 
 
@@ -57,7 +57,7 @@ class Cohere2RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -96,7 +96,7 @@ class Cohere2RotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.repeat_interleave(freqs, 2, dim=-1)  # diff from Llama: we interleave() instead of cat()
             cos = emb.cos() * self.attention_scaling
@@ -198,6 +198,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype)
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class Cohere2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -233,7 +234,7 @@ class Cohere2Attention(nn.Module):
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         input_shape = hidden_states.shape[:-1]
         hidden_shape = (*input_shape, -1, self.head_dim)
@@ -304,7 +305,7 @@ class Cohere2DecoderLayer(GradientCheckpointingLayer):
         past_key_values: Optional[Cache] = None,
         use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
         """
         Args:
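Several signatures above swap `Unpack[FlashAttentionKwargs]` for the broader `Unpack[TransformersKwargs]`, so attention layers and decoder layers advertise the generic kwargs contract instead of a flash-attention-specific one. A generic sketch of the `Unpack`-on-`**kwargs` pattern itself; the field names are invented and this is not the actual `TransformersKwargs` definition:

```python
from typing_extensions import TypedDict, Unpack

class KwargsSketch(TypedDict, total=False):  # stand-in for TransformersKwargs
    output_attentions: bool
    output_hidden_states: bool

def forward(hidden_states: object, **kwargs: Unpack[KwargsSketch]) -> object:
    # Type checkers now validate keyword names/types against the TypedDict.
    if kwargs.get("output_attentions"):
        pass
    return hidden_states

forward("x", output_attentions=True)
```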
@@ -398,7 +399,7 @@ class Cohere2Model(Cohere2PreTrainedModel):
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
-        if use_cache and past_key_values is None and not self.training:
+        if use_cache and past_key_values is None:
             past_key_values = DynamicCache(config=self.config)
 
         if cache_position is None:
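Dropping `and not self.training` means a `DynamicCache` is now allocated whenever `use_cache=True` and no cache was passed, including in training mode. A minimal before/after restatement of just the guard, in plain Python with a string standing in for the real `DynamicCache`:

```python
def make_cache(use_cache, past_key_values, training):
    # old guard: if use_cache and past_key_values is None and not training:
    if use_cache and past_key_values is None:
        past_key_values = "DynamicCache(config=...)"  # placeholder for the real object
    return past_key_values

print(make_cache(True, None, training=True))        # now allocates a cache even in training
print(make_cache(True, "existing", training=False))  # an explicitly passed cache is always kept
```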
@@ -22,7 +22,6 @@ import torch.nn as nn
 from ...cache_utils import Cache, DynamicCache
 from ...configuration_utils import PreTrainedConfig, layer_type_validation
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
-from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import BaseModelOutputWithPast
 from ...modeling_rope_utils import (
     RopeParameters,
@@ -31,6 +30,7 @@ from ...modeling_rope_utils import (
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, logging
+from ...utils.generic import maybe_autocast
 from ..cohere.modeling_cohere import (
     CohereAttention,
     CohereDecoderLayer,
@@ -223,7 +223,7 @@ class Cohere2RotaryEmbedding(CohereRotaryEmbedding):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.repeat_interleave(freqs, 2, dim=-1)  # diff from Llama: we interleave() instead of cat()
             cos = emb.cos() * self.attention_scaling
@@ -271,7 +271,7 @@ class Cohere2Attention(CohereAttention):
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
         input_shape = hidden_states.shape[:-1]
         hidden_shape = (*input_shape, -1, self.head_dim)
@@ -322,7 +322,7 @@ class Cohere2DecoderLayer(CohereDecoderLayer):
         past_key_values: Optional[Cache] = None,
         use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
+        **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
@@ -367,7 +367,7 @@ class Cohere2Model(Gemma2Model):
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
-        if use_cache and past_key_values is None and not self.training:
+        if use_cache and past_key_values is None:
             past_key_values = DynamicCache(config=self.config)
 
         if cache_position is None:
@@ -93,8 +93,9 @@ def get_optimal_tiled_canvas(
     patch_size_height, patch_size_width = target_tile_size  # (height == width)
 
     candidate_resolutions = np.array(possible_resolutions) * patch_size_height
-    original_size = np.stack([image_height, image_width])
-    required_scales = candidate_resolutions / original_size
+    # tiles following (width, height) order to align with aspect ratio convention
+    tile_size = np.stack([image_width, image_height])
+    required_scales = candidate_resolutions / tile_size
     required_scale = np.min(required_scales, axis=-1, keepdims=True)  # [n_resolutions, 1]
     if np.all(required_scale < 1):
         # We are forced to downscale, so try to minimize the amount of downscaling
@@ -103,7 +104,7 @@ def get_optimal_tiled_canvas(
     # Pick the resolution that required the least upscaling so that it most closely fits the image
     required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
     best_grid = possible_resolutions[np.argmin(required_scale)]
-    return best_grid
+    return best_grid  # (width, height)
 
 
 @auto_docstring
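The fix is an axis-ordering one: the `possible_resolutions` grids follow a (width, height) convention, so the image size must be stacked as `[image_width, image_height]` before the element-wise division. A worked example using the same NumPy operations, showing how the old (height, width) stack picked the transposed grid for a tall image:

```python
import numpy as np

possible_resolutions = np.array([[1, 2], [2, 1]])  # (width, height) tile grids
patch = 100
image_width, image_height = 100, 190               # tall image: should pick the 1x2 grid

candidate_resolutions = possible_resolutions * patch   # [[100, 200], [200, 100]]
tile_size = np.stack([image_width, image_height])      # (w, h), matching the candidates
required_scales = candidate_resolutions / tile_size
required_scale = np.min(required_scales, axis=-1, keepdims=True)
required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
best_grid = possible_resolutions[np.argmin(required_scale)]
print(best_grid)  # [1 2]: one tile wide, two tall; the old (h, w) stack returned [2 1]
```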
@@ -262,7 +263,6 @@ class Cohere2VisionImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
 
         return BatchFeature(
             data={"pixel_values": processed_images, "num_patches": num_patches}, tensor_type=return_tensors
@@ -376,6 +376,7 @@ class Cohere2VisionForConditionalGeneration(Cohere2VisionPreTrainedModel, Genera
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -387,12 +388,15 @@
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsequent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
 
         return model_inputs
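The gate no longer keys on `cache_position[0] == 0` but on an explicit `is_first_iteration` flag threaded through `prepare_inputs_for_generation`, which also covers continuing from a cached system prompt. A minimal restatement of just the gating logic; names follow the diff, and this is not the full method:

```python
# Pixel values are forwarded only on the first generation step, or on every
# step when no cache is kept (so the image tokens can be re-merged each time).
def gate_pixel_values(model_inputs, pixel_values, is_first_iteration, use_cache=True):
    if is_first_iteration or not use_cache:
        model_inputs["pixel_values"] = pixel_values
    return model_inputs

inputs = gate_pixel_values({}, pixel_values="...", is_first_iteration=True)
assert "pixel_values" in inputs
inputs = gate_pixel_values({}, pixel_values="...", is_first_iteration=False)
assert "pixel_values" not in inputs
```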
@@ -295,8 +295,9 @@ def get_optimal_tiled_canvas(
     patch_size_height, patch_size_width = target_tile_size  # (height == width)
 
     candidate_resolutions = np.array(possible_resolutions) * patch_size_height
-    original_size = np.stack([image_height, image_width])
-    required_scales = candidate_resolutions / original_size
+    # tiles following (width, height) order to align with aspect ratio convention
+    tile_size = np.stack([image_width, image_height])
+    required_scales = candidate_resolutions / tile_size
     required_scale = np.min(required_scales, axis=-1, keepdims=True)  # [n_resolutions, 1]
     if np.all(required_scale < 1):
         # We are forced to downscale, so try to minimize the amount of downscaling
@@ -305,7 +306,7 @@
     # Pick the resolution that required the least upscaling so that it most closely fits the image
     required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
     best_grid = possible_resolutions[np.argmin(required_scale)]
-    return best_grid
+    return best_grid  # (width, height)
 
 
 class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False):
@@ -141,6 +141,7 @@ class ColQwen2ForRetrieval(ColQwen2PreTrainedModel):
         pixel_values: Optional[torch.Tensor] = None,
         image_grid_thw: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> ColQwen2ForRetrievalOutput:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
@@ -322,6 +322,7 @@ class ColQwen2ForRetrieval(ColPaliForRetrieval):
         pixel_values: Optional[torch.Tensor] = None,
         image_grid_thw: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> ColQwen2ForRetrievalOutput:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
@@ -37,7 +37,7 @@ class ConditionalDetrConfig(PreTrainedConfig):
         use_timm_backbone (`bool`, *optional*, defaults to `True`):
             Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`]
             API.
-        backbone_config (`PreTrainedConfig` or `dict`, *optional*):
+        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `ResNetConfig()`):
             The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which
             case it will default to `ResNetConfig()`.
         num_channels (`int`, *optional*, defaults to 3):
@@ -984,7 +984,7 @@ class ConditionalDetrPreTrainedModel(PreTrainedModel):
         elif isinstance(module, ConditionalDetrLearnedPositionEmbedding):
             init.uniform_(module.row_embeddings.weight)
             init.uniform_(module.column_embeddings.weight)
-        if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
             init.normal_(module.weight, mean=0.0, std=std)
             if module.bias is not None:
                 init.zeros_(module.bias)
@@ -993,6 +993,9 @@ class ConditionalDetrPreTrainedModel(PreTrainedModel):
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            init.ones_(module.weight)
+            init.zeros_(module.bias)
 
 
 # Copied from transformers.models.detr.modeling_detr.DetrEncoder with Detr->ConditionalDetr,DETR->ConditionalDETR
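Two related init fixes here: `nn.BatchNorm2d` no longer falls into the `normal_()` branch meant for projection weights, and affine norm layers (`nn.LayerNorm`, `nn.GroupNorm`) now get the conventional weight=1 / bias=0 initialization. A standalone check of the added branch, using plain `torch.nn.init` (the diff's `init` appears to be the library's own initialization helpers with the same method names):

```python
import torch
from torch import nn
from torch.nn import init

def init_norms(module):
    if isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
        init.ones_(module.weight)   # identity scale
        init.zeros_(module.bias)    # zero shift

layer = nn.LayerNorm(8)
init_norms(layer)
assert torch.all(layer.weight == 1) and torch.all(layer.bias == 0)
```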
@@ -1032,6 +1035,7 @@ class ConditionalDetrEncoder(ConditionalDetrPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -1156,6 +1160,7 @@ class ConditionalDetrDecoder(ConditionalDetrPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
+        **kwargs,
     ):
         r"""
         Args:
@@ -1344,6 +1349,7 @@ class ConditionalDetrModel(ConditionalDetrPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], ConditionalDetrModelOutput]:
         r"""
         decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
@@ -1529,6 +1535,7 @@ class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], ConditionalDetrObjectDetectionOutput]:
         r"""
         decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
@@ -1693,6 +1700,7 @@ class ConditionalDetrForSegmentation(ConditionalDetrPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.FloatTensor], ConditionalDetrSegmentationOutput]:
         r"""
         decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
@@ -118,6 +118,9 @@ class ConvBertPreTrainedModel(PreTrainedModel):
         elif isinstance(module, GroupedLinearLayer):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             init.zeros_(module.bias)
+        elif isinstance(module, ConvBertEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
 
 class SeparableConv1D(nn.Module):
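The new branch gives `ConvBertEmbeddings` deterministic non-random state: `position_ids` becomes the `[[0, 1, ..., L-1]]` range and `token_type_ids` all zeros. What that computes, in plain torch; the diff's `init.copy_` appears to be a library helper, so an in-place `Tensor.copy_` is used here instead:

```python
import torch

seq_len = 6
position_ids = torch.empty(1, seq_len, dtype=torch.long)
position_ids.copy_(torch.arange(position_ids.shape[-1]).expand((1, -1)))
token_type_ids = torch.zeros_like(position_ids)
print(position_ids)    # tensor([[0, 1, 2, 3, 4, 5]])
print(token_type_ids)  # tensor([[0, 0, 0, 0, 0, 0]])
```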
@@ -629,6 +632,7 @@ class ConvBertModel(ConvBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithCrossAttentions]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -729,6 +733,7 @@ class ConvBertForMaskedLM(ConvBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MaskedLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -824,6 +829,7 @@ class ConvBertForSequenceClassification(ConvBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -906,6 +912,7 @@ class ConvBertForMultipleChoice(ConvBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MultipleChoiceModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -1013,6 +1020,7 @@ class ConvBertForTokenClassification(ConvBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1078,6 +1086,7 @@ class ConvBertForQuestionAnswering(ConvBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -78,7 +78,7 @@ class ConvNextImageProcessor(BaseImageProcessor):
         crop_pct (`float` *optional*, defaults to 224 / 256):
             Percentage of the image to crop. Only has an effect if `do_resize` is `True` and size < 384. Can be
             overridden by `crop_pct` in the `preprocess` method.
-        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
             Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
         do_rescale (`bool`, *optional*, defaults to `True`):
             Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
@@ -105,7 +105,7 @@ class ConvNextImageProcessor(BaseImageProcessor):
         do_resize: bool = True,
         size: Optional[dict[str, int]] = None,
         crop_pct: Optional[float] = None,
-        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
         do_rescale: bool = True,
         rescale_factor: Union[int, float] = 1 / 255,
         do_normalize: bool = True,
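With this pair of hunks the docstring and the `__init__` default finally agree on bicubic resampling. For reference, `PILImageResampling` aliases PIL's `Image.Resampling` enum, so the change amounts to the following default; the sketch assumes Pillow is installed:

```python
from PIL import Image

resample = Image.Resampling.BICUBIC  # new default; previously BILINEAR in __init__ only
img = Image.new("RGB", (640, 480))
print(img.resize((224, 224), resample=resample).size)  # (224, 224)
```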