transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (835)
  1. transformers/__init__.py +49 -3
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/cli/serve.py +47 -17
  6. transformers/configuration_utils.py +114 -70
  7. transformers/conversion_mapping.py +83 -7
  8. transformers/convert_slow_tokenizer.py +225 -10
  9. transformers/core_model_loading.py +374 -147
  10. transformers/data/data_collator.py +12 -4
  11. transformers/dependency_versions_table.py +2 -3
  12. transformers/dynamic_module_utils.py +1 -2
  13. transformers/feature_extraction_utils.py +55 -24
  14. transformers/file_utils.py +0 -1
  15. transformers/generation/__init__.py +11 -1
  16. transformers/generation/candidate_generator.py +79 -31
  17. transformers/generation/configuration_utils.py +165 -124
  18. transformers/generation/continuous_batching/__init__.py +4 -0
  19. transformers/generation/continuous_batching/cache.py +47 -18
  20. transformers/generation/continuous_batching/cache_manager.py +131 -34
  21. transformers/generation/continuous_batching/continuous_api.py +228 -136
  22. transformers/generation/continuous_batching/requests.py +28 -1
  23. transformers/generation/continuous_batching/scheduler.py +11 -4
  24. transformers/generation/stopping_criteria.py +1 -1
  25. transformers/generation/utils.py +108 -110
  26. transformers/generation/watermarking.py +8 -5
  27. transformers/image_processing_base.py +3 -14
  28. transformers/image_processing_utils_fast.py +15 -4
  29. transformers/initialization.py +37 -0
  30. transformers/integrations/__init__.py +16 -2
  31. transformers/integrations/accelerate.py +58 -113
  32. transformers/integrations/aqlm.py +36 -66
  33. transformers/integrations/awq.py +46 -515
  34. transformers/integrations/bitnet.py +47 -105
  35. transformers/integrations/bitsandbytes.py +91 -202
  36. transformers/integrations/deepspeed.py +18 -2
  37. transformers/integrations/eetq.py +84 -81
  38. transformers/integrations/fbgemm_fp8.py +191 -145
  39. transformers/integrations/finegrained_fp8.py +241 -208
  40. transformers/integrations/flash_attention.py +2 -2
  41. transformers/integrations/fp_quant.py +92 -0
  42. transformers/integrations/ggml.py +11 -1
  43. transformers/integrations/higgs.py +37 -62
  44. transformers/integrations/hub_kernels.py +65 -8
  45. transformers/integrations/integration_utils.py +45 -0
  46. transformers/integrations/mistral.py +12 -0
  47. transformers/integrations/moe.py +240 -0
  48. transformers/integrations/mxfp4.py +28 -74
  49. transformers/integrations/peft.py +12 -29
  50. transformers/integrations/quanto.py +77 -56
  51. transformers/integrations/quark.py +55 -0
  52. transformers/integrations/spqr.py +42 -90
  53. transformers/integrations/tensor_parallel.py +167 -221
  54. transformers/integrations/torchao.py +32 -38
  55. transformers/integrations/vptq.py +40 -59
  56. transformers/modelcard.py +1 -2
  57. transformers/modeling_gguf_pytorch_utils.py +74 -19
  58. transformers/modeling_rope_utils.py +107 -86
  59. transformers/modeling_utils.py +611 -527
  60. transformers/models/__init__.py +22 -0
  61. transformers/models/afmoe/modeling_afmoe.py +10 -19
  62. transformers/models/afmoe/modular_afmoe.py +5 -13
  63. transformers/models/aimv2/modeling_aimv2.py +4 -0
  64. transformers/models/aimv2/modular_aimv2.py +4 -0
  65. transformers/models/albert/modeling_albert.py +3 -0
  66. transformers/models/albert/tokenization_albert.py +6 -12
  67. transformers/models/align/modeling_align.py +14 -6
  68. transformers/models/altclip/modeling_altclip.py +11 -3
  69. transformers/models/apertus/modeling_apertus.py +8 -6
  70. transformers/models/apertus/modular_apertus.py +4 -1
  71. transformers/models/arcee/modeling_arcee.py +5 -5
  72. transformers/models/aria/modeling_aria.py +12 -8
  73. transformers/models/aria/modular_aria.py +7 -3
  74. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  75. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  76. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  77. transformers/models/auto/auto_factory.py +1 -1
  78. transformers/models/auto/configuration_auto.py +38 -0
  79. transformers/models/auto/feature_extraction_auto.py +9 -3
  80. transformers/models/auto/image_processing_auto.py +5 -2
  81. transformers/models/auto/modeling_auto.py +37 -0
  82. transformers/models/auto/processing_auto.py +22 -10
  83. transformers/models/auto/tokenization_auto.py +147 -566
  84. transformers/models/auto/video_processing_auto.py +5 -2
  85. transformers/models/autoformer/modeling_autoformer.py +4 -0
  86. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  87. transformers/models/bamba/modeling_bamba.py +21 -21
  88. transformers/models/bamba/modular_bamba.py +17 -16
  89. transformers/models/bark/modeling_bark.py +11 -0
  90. transformers/models/bart/configuration_bart.py +0 -1
  91. transformers/models/bart/modeling_bart.py +14 -0
  92. transformers/models/barthez/tokenization_barthez.py +5 -10
  93. transformers/models/beit/image_processing_beit_fast.py +0 -1
  94. transformers/models/beit/modeling_beit.py +6 -1
  95. transformers/models/bert/modeling_bert.py +3 -0
  96. transformers/models/bert/tokenization_bert.py +8 -21
  97. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  98. transformers/models/big_bird/modeling_big_bird.py +9 -0
  99. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  100. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
  101. transformers/models/biogpt/modeling_biogpt.py +2 -0
  102. transformers/models/biogpt/modular_biogpt.py +2 -0
  103. transformers/models/bit/modeling_bit.py +16 -3
  104. transformers/models/bitnet/modeling_bitnet.py +5 -5
  105. transformers/models/blenderbot/modeling_blenderbot.py +12 -0
  106. transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
  107. transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
  108. transformers/models/blip/modeling_blip.py +2 -0
  109. transformers/models/blip/modeling_blip_text.py +10 -0
  110. transformers/models/blip_2/modeling_blip_2.py +4 -1
  111. transformers/models/bloom/modeling_bloom.py +17 -44
  112. transformers/models/blt/modeling_blt.py +164 -4
  113. transformers/models/blt/modular_blt.py +170 -5
  114. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  115. transformers/models/bridgetower/modeling_bridgetower.py +11 -1
  116. transformers/models/bros/modeling_bros.py +12 -0
  117. transformers/models/camembert/modeling_camembert.py +109 -106
  118. transformers/models/camembert/tokenization_camembert.py +8 -12
  119. transformers/models/canine/modeling_canine.py +11 -0
  120. transformers/models/canine/tokenization_canine.py +2 -0
  121. transformers/models/chameleon/modeling_chameleon.py +11 -5
  122. transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
  123. transformers/models/clap/feature_extraction_clap.py +2 -2
  124. transformers/models/clap/modeling_clap.py +30 -15
  125. transformers/models/clip/modeling_clip.py +2 -0
  126. transformers/models/clip/tokenization_clip.py +22 -44
  127. transformers/models/clipseg/modeling_clipseg.py +9 -0
  128. transformers/models/clvp/modeling_clvp.py +19 -3
  129. transformers/models/clvp/tokenization_clvp.py +1 -63
  130. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  131. transformers/models/codegen/modeling_codegen.py +13 -4
  132. transformers/models/codegen/tokenization_codegen.py +14 -43
  133. transformers/models/cohere/modeling_cohere.py +5 -4
  134. transformers/models/cohere/modular_cohere.py +2 -1
  135. transformers/models/cohere/tokenization_cohere.py +12 -42
  136. transformers/models/cohere2/modeling_cohere2.py +8 -7
  137. transformers/models/cohere2/modular_cohere2.py +5 -5
  138. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
  139. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  140. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  141. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  142. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  143. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  144. transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
  145. transformers/models/convbert/modeling_convbert.py +9 -0
  146. transformers/models/convnext/image_processing_convnext.py +2 -2
  147. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  148. transformers/models/convnext/modeling_convnext.py +2 -4
  149. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  150. transformers/models/csm/generation_csm.py +19 -22
  151. transformers/models/csm/modeling_csm.py +7 -4
  152. transformers/models/csm/modular_csm.py +2 -0
  153. transformers/models/ctrl/modeling_ctrl.py +15 -2
  154. transformers/models/cvt/modeling_cvt.py +7 -1
  155. transformers/models/cwm/modeling_cwm.py +5 -5
  156. transformers/models/d_fine/configuration_d_fine.py +3 -4
  157. transformers/models/d_fine/modeling_d_fine.py +48 -39
  158. transformers/models/d_fine/modular_d_fine.py +16 -4
  159. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  160. transformers/models/dab_detr/modeling_dab_detr.py +5 -1
  161. transformers/models/dac/modeling_dac.py +6 -6
  162. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  163. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  164. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  165. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  166. transformers/models/dbrx/configuration_dbrx.py +9 -1
  167. transformers/models/dbrx/modeling_dbrx.py +3 -3
  168. transformers/models/deberta/modeling_deberta.py +7 -0
  169. transformers/models/deberta/tokenization_deberta.py +11 -20
  170. transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
  171. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  172. transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
  173. transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
  174. transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
  175. transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
  176. transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
  177. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  178. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  179. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  180. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  181. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  182. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  183. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  184. transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
  185. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  186. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  187. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  188. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  189. transformers/models/detr/configuration_detr.py +1 -1
  190. transformers/models/detr/modeling_detr.py +13 -1
  191. transformers/models/dia/generation_dia.py +3 -10
  192. transformers/models/dia/modeling_dia.py +16 -4
  193. transformers/models/dia/modular_dia.py +11 -1
  194. transformers/models/dia/processing_dia.py +1 -1
  195. transformers/models/diffllama/modeling_diffllama.py +5 -5
  196. transformers/models/diffllama/modular_diffllama.py +2 -2
  197. transformers/models/dinat/modeling_dinat.py +3 -0
  198. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  199. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  200. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
  201. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
  202. transformers/models/distilbert/modeling_distilbert.py +11 -9
  203. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  204. transformers/models/doge/modeling_doge.py +3 -4
  205. transformers/models/doge/modular_doge.py +0 -1
  206. transformers/models/donut/image_processing_donut_fast.py +0 -1
  207. transformers/models/donut/modeling_donut_swin.py +18 -12
  208. transformers/models/dots1/modeling_dots1.py +23 -11
  209. transformers/models/dots1/modular_dots1.py +5 -3
  210. transformers/models/dpr/modeling_dpr.py +5 -0
  211. transformers/models/dpr/tokenization_dpr.py +12 -0
  212. transformers/models/dpt/configuration_dpt.py +1 -1
  213. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  214. transformers/models/dpt/modular_dpt.py +1 -2
  215. transformers/models/edgetam/configuration_edgetam.py +1 -1
  216. transformers/models/edgetam/modeling_edgetam.py +6 -3
  217. transformers/models/edgetam/modular_edgetam.py +15 -14
  218. transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
  219. transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
  220. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  221. transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
  222. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  223. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  224. transformers/models/efficientnet/modeling_efficientnet.py +7 -1
  225. transformers/models/electra/modeling_electra.py +7 -0
  226. transformers/models/emu3/modeling_emu3.py +12 -6
  227. transformers/models/emu3/modular_emu3.py +7 -1
  228. transformers/models/encodec/modeling_encodec.py +14 -0
  229. transformers/models/eomt/image_processing_eomt.py +13 -1
  230. transformers/models/eomt/image_processing_eomt_fast.py +60 -16
  231. transformers/models/eomt/modeling_eomt.py +7 -0
  232. transformers/models/eomt/modular_eomt.py +7 -0
  233. transformers/models/ernie/modeling_ernie.py +6 -0
  234. transformers/models/ernie/modular_ernie.py +6 -0
  235. transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
  236. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  237. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
  238. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
  239. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  240. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  241. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  242. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  243. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  244. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  245. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  246. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  247. transformers/models/esm/modeling_esm.py +6 -0
  248. transformers/models/esm/modeling_esmfold.py +11 -5
  249. transformers/models/evolla/modeling_evolla.py +13 -5
  250. transformers/models/evolla/modular_evolla.py +8 -0
  251. transformers/models/exaone4/modeling_exaone4.py +3 -3
  252. transformers/models/exaone4/modular_exaone4.py +0 -1
  253. transformers/models/falcon/modeling_falcon.py +9 -4
  254. transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
  255. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  256. transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
  257. transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
  258. transformers/models/fast_vlm/__init__.py +27 -0
  259. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  260. transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
  261. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  262. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
  263. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  264. transformers/models/flaubert/modeling_flaubert.py +21 -15
  265. transformers/models/flava/image_processing_flava_fast.py +0 -2
  266. transformers/models/flava/modeling_flava.py +10 -2
  267. transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
  268. transformers/models/florence2/modeling_florence2.py +22 -4
  269. transformers/models/florence2/modular_florence2.py +15 -1
  270. transformers/models/fnet/modeling_fnet.py +14 -0
  271. transformers/models/focalnet/modeling_focalnet.py +4 -0
  272. transformers/models/fsmt/modeling_fsmt.py +2 -0
  273. transformers/models/funnel/modeling_funnel.py +8 -0
  274. transformers/models/funnel/tokenization_funnel.py +17 -24
  275. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  276. transformers/models/fuyu/modeling_fuyu.py +3 -1
  277. transformers/models/fuyu/processing_fuyu.py +19 -3
  278. transformers/models/gemma/modeling_gemma.py +14 -16
  279. transformers/models/gemma/modular_gemma.py +9 -11
  280. transformers/models/gemma/tokenization_gemma.py +10 -27
  281. transformers/models/gemma2/modeling_gemma2.py +5 -5
  282. transformers/models/gemma2/modular_gemma2.py +3 -2
  283. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  284. transformers/models/gemma3/modeling_gemma3.py +42 -91
  285. transformers/models/gemma3/modular_gemma3.py +38 -87
  286. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  287. transformers/models/gemma3n/modeling_gemma3n.py +65 -218
  288. transformers/models/gemma3n/modular_gemma3n.py +68 -68
  289. transformers/models/git/modeling_git.py +183 -126
  290. transformers/models/glm/modeling_glm.py +5 -5
  291. transformers/models/glm4/modeling_glm4.py +5 -5
  292. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  293. transformers/models/glm46v/modeling_glm46v.py +3 -1
  294. transformers/models/glm46v/modular_glm46v.py +3 -0
  295. transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
  296. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  297. transformers/models/glm4v/configuration_glm4v.py +3 -1
  298. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  299. transformers/models/glm4v/modeling_glm4v.py +18 -8
  300. transformers/models/glm4v/modular_glm4v.py +17 -7
  301. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  302. transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
  303. transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
  304. transformers/models/glmasr/__init__.py +30 -0
  305. transformers/models/glmasr/configuration_glmasr.py +197 -0
  306. transformers/models/glmasr/modeling_glmasr.py +512 -0
  307. transformers/models/glmasr/modular_glmasr.py +433 -0
  308. transformers/models/glmasr/processing_glmasr.py +332 -0
  309. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  310. transformers/models/glpn/modeling_glpn.py +2 -0
  311. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  312. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  313. transformers/models/gpt2/modeling_gpt2.py +13 -6
  314. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  315. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
  316. transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
  317. transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
  318. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  319. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  320. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
  321. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  322. transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
  323. transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
  324. transformers/models/gptj/modeling_gptj.py +18 -6
  325. transformers/models/granite/modeling_granite.py +5 -5
  326. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  327. transformers/models/granitemoe/modeling_granitemoe.py +6 -9
  328. transformers/models/granitemoe/modular_granitemoe.py +1 -4
  329. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  330. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
  331. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  332. transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
  333. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  334. transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
  335. transformers/models/groupvit/modeling_groupvit.py +9 -1
  336. transformers/models/helium/modeling_helium.py +5 -4
  337. transformers/models/herbert/tokenization_herbert.py +9 -25
  338. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
  339. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
  340. transformers/models/hiera/modeling_hiera.py +4 -0
  341. transformers/models/hubert/modeling_hubert.py +7 -0
  342. transformers/models/hubert/modular_hubert.py +5 -0
  343. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
  344. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  345. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  346. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
  347. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  348. transformers/models/ibert/modeling_ibert.py +22 -0
  349. transformers/models/idefics/modeling_idefics.py +15 -21
  350. transformers/models/idefics2/modeling_idefics2.py +7 -1
  351. transformers/models/idefics3/modeling_idefics3.py +5 -1
  352. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  353. transformers/models/imagegpt/modeling_imagegpt.py +11 -3
  354. transformers/models/informer/modeling_informer.py +4 -0
  355. transformers/models/informer/modular_informer.py +1 -0
  356. transformers/models/instructblip/modeling_instructblip.py +2 -0
  357. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  358. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  359. transformers/models/internvl/modeling_internvl.py +13 -12
  360. transformers/models/internvl/modular_internvl.py +7 -13
  361. transformers/models/internvl/video_processing_internvl.py +0 -1
  362. transformers/models/jais2/__init__.py +27 -0
  363. transformers/models/jais2/configuration_jais2.py +152 -0
  364. transformers/models/jais2/modeling_jais2.py +486 -0
  365. transformers/models/jais2/modular_jais2.py +196 -0
  366. transformers/models/jamba/modeling_jamba.py +25 -20
  367. transformers/models/jamba/modular_jamba.py +17 -17
  368. transformers/models/janus/image_processing_janus_fast.py +0 -1
  369. transformers/models/janus/modeling_janus.py +16 -7
  370. transformers/models/janus/modular_janus.py +17 -7
  371. transformers/models/jetmoe/modeling_jetmoe.py +4 -4
  372. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  373. transformers/models/kosmos2/modeling_kosmos2.py +15 -2
  374. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  375. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  376. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
  377. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  378. transformers/models/lasr/__init__.py +29 -0
  379. transformers/models/lasr/configuration_lasr.py +248 -0
  380. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  381. transformers/models/lasr/modeling_lasr.py +730 -0
  382. transformers/models/lasr/modular_lasr.py +576 -0
  383. transformers/models/lasr/processing_lasr.py +94 -0
  384. transformers/models/lasr/tokenization_lasr.py +186 -0
  385. transformers/models/layoutlm/modeling_layoutlm.py +10 -3
  386. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  387. transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
  388. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
  389. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  390. transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
  391. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  392. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  393. transformers/models/led/modeling_led.py +12 -0
  394. transformers/models/levit/modeling_levit.py +21 -0
  395. transformers/models/lfm2/modeling_lfm2.py +5 -6
  396. transformers/models/lfm2/modular_lfm2.py +0 -1
  397. transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
  398. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  399. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  400. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  401. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  402. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  403. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  404. transformers/models/lightglue/modeling_lightglue.py +3 -1
  405. transformers/models/lightglue/modular_lightglue.py +1 -0
  406. transformers/models/lilt/modeling_lilt.py +23 -15
  407. transformers/models/llama/modeling_llama.py +5 -5
  408. transformers/models/llama/tokenization_llama.py +15 -43
  409. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  410. transformers/models/llama4/modeling_llama4.py +11 -6
  411. transformers/models/llava/image_processing_llava_fast.py +0 -1
  412. transformers/models/llava/modeling_llava.py +12 -7
  413. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  414. transformers/models/llava_next/modeling_llava_next.py +7 -3
  415. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  416. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  417. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  418. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  419. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  420. transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
  421. transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
  422. transformers/models/longformer/modeling_longformer.py +6 -0
  423. transformers/models/longt5/modeling_longt5.py +4 -4
  424. transformers/models/luke/modeling_luke.py +9 -0
  425. transformers/models/luke/tokenization_luke.py +11 -38
  426. transformers/models/lxmert/modeling_lxmert.py +2 -0
  427. transformers/models/m2m_100/modeling_m2m_100.py +14 -0
  428. transformers/models/mamba/modeling_mamba.py +16 -23
  429. transformers/models/mamba2/modeling_mamba2.py +24 -23
  430. transformers/models/marian/configuration_marian.py +1 -1
  431. transformers/models/marian/modeling_marian.py +8 -0
  432. transformers/models/markuplm/modeling_markuplm.py +9 -8
  433. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  434. transformers/models/mask2former/configuration_mask2former.py +3 -3
  435. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  436. transformers/models/mask2former/modeling_mask2former.py +11 -0
  437. transformers/models/maskformer/configuration_maskformer.py +3 -3
  438. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  439. transformers/models/maskformer/modeling_maskformer.py +11 -1
  440. transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
  441. transformers/models/mbart/configuration_mbart.py +1 -0
  442. transformers/models/mbart/modeling_mbart.py +14 -0
  443. transformers/models/mbart/tokenization_mbart.py +11 -52
  444. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  445. transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
  446. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  447. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  448. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  449. transformers/models/mimi/modeling_mimi.py +28 -5
  450. transformers/models/minimax/modeling_minimax.py +19 -6
  451. transformers/models/minimax/modular_minimax.py +12 -1
  452. transformers/models/ministral/modeling_ministral.py +5 -5
  453. transformers/models/ministral3/configuration_ministral3.py +1 -1
  454. transformers/models/ministral3/modeling_ministral3.py +5 -4
  455. transformers/models/mistral/modeling_mistral.py +5 -4
  456. transformers/models/mistral3/modeling_mistral3.py +10 -4
  457. transformers/models/mistral3/modular_mistral3.py +3 -1
  458. transformers/models/mixtral/modeling_mixtral.py +15 -7
  459. transformers/models/mixtral/modular_mixtral.py +6 -2
  460. transformers/models/mlcd/modeling_mlcd.py +6 -0
  461. transformers/models/mlcd/modular_mlcd.py +4 -0
  462. transformers/models/mllama/modeling_mllama.py +15 -4
  463. transformers/models/mluke/tokenization_mluke.py +6 -6
  464. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  465. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
  466. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  467. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  468. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  469. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  470. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  471. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  472. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  473. transformers/models/mobilevit/modeling_mobilevit.py +7 -0
  474. transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
  475. transformers/models/modernbert/modeling_modernbert.py +16 -2
  476. transformers/models/modernbert/modular_modernbert.py +14 -1
  477. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
  478. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
  479. transformers/models/moonshine/modeling_moonshine.py +5 -3
  480. transformers/models/moshi/modeling_moshi.py +26 -53
  481. transformers/models/mpnet/modeling_mpnet.py +7 -0
  482. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  483. transformers/models/mpt/modeling_mpt.py +2 -0
  484. transformers/models/mra/modeling_mra.py +10 -1
  485. transformers/models/mt5/configuration_mt5.py +2 -3
  486. transformers/models/mt5/modeling_mt5.py +7 -10
  487. transformers/models/musicgen/modeling_musicgen.py +7 -9
  488. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
  489. transformers/models/mvp/modeling_mvp.py +14 -0
  490. transformers/models/nanochat/modeling_nanochat.py +5 -5
  491. transformers/models/nemotron/modeling_nemotron.py +7 -5
  492. transformers/models/nllb/tokenization_nllb.py +8 -22
  493. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  494. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  495. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  496. transformers/models/nougat/tokenization_nougat.py +15 -68
  497. transformers/models/nystromformer/modeling_nystromformer.py +13 -0
  498. transformers/models/olmo/modeling_olmo.py +5 -5
  499. transformers/models/olmo/modular_olmo.py +2 -2
  500. transformers/models/olmo2/modeling_olmo2.py +5 -6
  501. transformers/models/olmo2/modular_olmo2.py +0 -1
  502. transformers/models/olmo3/modeling_olmo3.py +5 -5
  503. transformers/models/olmoe/modeling_olmoe.py +15 -7
  504. transformers/models/olmoe/modular_olmoe.py +4 -2
  505. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  506. transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
  507. transformers/models/oneformer/configuration_oneformer.py +3 -3
  508. transformers/models/oneformer/modeling_oneformer.py +11 -39
  509. transformers/models/openai/modeling_openai.py +15 -0
  510. transformers/models/openai/tokenization_openai.py +10 -46
  511. transformers/models/opt/modeling_opt.py +2 -0
  512. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  513. transformers/models/ovis2/modeling_ovis2.py +15 -3
  514. transformers/models/ovis2/modular_ovis2.py +8 -0
  515. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  516. transformers/models/owlv2/modeling_owlv2.py +11 -3
  517. transformers/models/owlv2/modular_owlv2.py +0 -2
  518. transformers/models/owlvit/modeling_owlvit.py +11 -3
  519. transformers/models/paddleocr_vl/__init__.py +32 -0
  520. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  521. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
  522. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  523. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
  524. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
  525. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  526. transformers/models/paligemma/modeling_paligemma.py +25 -17
  527. transformers/models/parakeet/configuration_parakeet.py +4 -6
  528. transformers/models/parakeet/modeling_parakeet.py +14 -6
  529. transformers/models/parakeet/modular_parakeet.py +7 -2
  530. transformers/models/parakeet/processing_parakeet.py +1 -0
  531. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  532. transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
  533. transformers/models/patchtst/modeling_patchtst.py +25 -6
  534. transformers/models/pe_audio/__init__.py +30 -0
  535. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  536. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  537. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  538. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  539. transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
  540. transformers/models/pe_audio_video/__init__.py +29 -0
  541. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  542. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  543. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  544. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  545. transformers/models/pe_video/__init__.py +30 -0
  546. transformers/models/pe_video/configuration_pe_video.py +211 -0
  547. transformers/models/pe_video/modeling_pe_video.py +636 -0
  548. transformers/models/pe_video/modular_pe_video.py +219 -0
  549. transformers/models/pe_video/processing_pe_video.py +10 -0
  550. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  551. transformers/models/pegasus/configuration_pegasus.py +1 -0
  552. transformers/models/pegasus/modeling_pegasus.py +8 -0
  553. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  554. transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
  555. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  556. transformers/models/perceiver/modeling_perceiver.py +13 -1
  557. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  558. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  559. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  560. transformers/models/persimmon/modeling_persimmon.py +3 -2
  561. transformers/models/phi/modeling_phi.py +5 -6
  562. transformers/models/phi/modular_phi.py +0 -1
  563. transformers/models/phi3/modeling_phi3.py +3 -2
  564. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
  565. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
  566. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  567. transformers/models/phimoe/modeling_phimoe.py +15 -7
  568. transformers/models/phimoe/modular_phimoe.py +3 -3
  569. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  570. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  571. transformers/models/pixio/__init__.py +30 -0
  572. transformers/models/pixio/configuration_pixio.py +151 -0
  573. transformers/models/pixio/modeling_pixio.py +507 -0
  574. transformers/models/pixio/modular_pixio.py +404 -0
  575. transformers/models/pixtral/modeling_pixtral.py +3 -2
  576. transformers/models/pixtral/processing_pixtral.py +3 -1
  577. transformers/models/plbart/configuration_plbart.py +1 -0
  578. transformers/models/plbart/modeling_plbart.py +13 -0
  579. transformers/models/plbart/modular_plbart.py +8 -0
  580. transformers/models/plbart/tokenization_plbart.py +0 -2
  581. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  582. transformers/models/poolformer/modeling_poolformer.py +13 -1
  583. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  584. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  585. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  586. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  587. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  588. transformers/models/prophetnet/modeling_prophetnet.py +5 -1
  589. transformers/models/pvt/modeling_pvt.py +2 -0
  590. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  591. transformers/models/qwen2/modeling_qwen2.py +5 -5
  592. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  593. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  594. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
  595. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
  596. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  597. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
  598. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
  599. transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
  600. transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
  601. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  602. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  603. transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
  604. transformers/models/qwen3/modeling_qwen3.py +5 -5
  605. transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
  606. transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
  607. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  608. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
  609. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
  610. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  611. transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
  612. transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
  613. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  614. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
  615. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
  616. transformers/models/rag/configuration_rag.py +0 -8
  617. transformers/models/rag/modeling_rag.py +8 -9
  618. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
  619. transformers/models/reformer/modeling_reformer.py +13 -1
  620. transformers/models/reformer/tokenization_reformer.py +11 -28
  621. transformers/models/regnet/modeling_regnet.py +10 -1
  622. transformers/models/rembert/modeling_rembert.py +13 -1
  623. transformers/models/rembert/tokenization_rembert.py +3 -10
  624. transformers/models/resnet/modeling_resnet.py +19 -5
  625. transformers/models/roberta/modeling_roberta.py +3 -0
  626. transformers/models/roberta/modular_roberta.py +3 -0
  627. transformers/models/roberta/tokenization_roberta.py +18 -27
  628. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  629. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  630. transformers/models/roformer/modeling_roformer.py +6 -0
  631. transformers/models/roformer/tokenization_roformer.py +77 -412
  632. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  633. transformers/models/rt_detr/modeling_rt_detr.py +6 -0
  634. transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
  635. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  636. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
  637. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  638. transformers/models/rwkv/modeling_rwkv.py +2 -1
  639. transformers/models/sam/configuration_sam.py +1 -0
  640. transformers/models/sam/image_processing_sam_fast.py +0 -1
  641. transformers/models/sam/modeling_sam.py +4 -1
  642. transformers/models/sam2/configuration_sam2.py +1 -1
  643. transformers/models/sam2/modeling_sam2.py +7 -3
  644. transformers/models/sam2/modular_sam2.py +7 -3
  645. transformers/models/sam2_video/modeling_sam2_video.py +52 -43
  646. transformers/models/sam2_video/modular_sam2_video.py +32 -18
  647. transformers/models/sam3/configuration_sam3.py +21 -1
  648. transformers/models/sam3/modeling_sam3.py +100 -80
  649. transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
  650. transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
  651. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  652. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
  653. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  654. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  655. transformers/models/sam3_video/modeling_sam3_video.py +4 -3
  656. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  657. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  658. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  659. transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
  660. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  661. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
  662. transformers/models/seed_oss/modeling_seed_oss.py +3 -3
  663. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  664. transformers/models/segformer/modeling_segformer.py +6 -3
  665. transformers/models/segformer/modular_segformer.py +0 -1
  666. transformers/models/seggpt/modeling_seggpt.py +2 -0
  667. transformers/models/sew/modeling_sew.py +3 -0
  668. transformers/models/sew/modular_sew.py +1 -0
  669. transformers/models/sew_d/modeling_sew_d.py +3 -0
  670. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  671. transformers/models/siglip/modeling_siglip.py +24 -2
  672. transformers/models/siglip2/modeling_siglip2.py +67 -41
  673. transformers/models/siglip2/modular_siglip2.py +4 -0
  674. transformers/models/smollm3/modeling_smollm3.py +5 -5
  675. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  676. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  677. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  678. transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
  679. transformers/models/speecht5/modeling_speecht5.py +41 -1
  680. transformers/models/splinter/modeling_splinter.py +12 -3
  681. transformers/models/splinter/tokenization_splinter.py +9 -28
  682. transformers/models/squeezebert/modeling_squeezebert.py +8 -0
  683. transformers/models/stablelm/modeling_stablelm.py +4 -2
  684. transformers/models/starcoder2/modeling_starcoder2.py +5 -4
  685. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  686. transformers/models/superglue/modeling_superglue.py +1 -0
  687. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  688. transformers/models/superpoint/modeling_superpoint.py +1 -0
  689. transformers/models/swiftformer/modeling_swiftformer.py +6 -0
  690. transformers/models/swin/modeling_swin.py +20 -12
  691. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  692. transformers/models/swin2sr/modeling_swin2sr.py +51 -33
  693. transformers/models/swinv2/modeling_swinv2.py +45 -33
  694. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  695. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  696. transformers/models/t5/configuration_t5.py +7 -1
  697. transformers/models/t5/modeling_t5.py +8 -7
  698. transformers/models/t5/tokenization_t5.py +4 -8
  699. transformers/models/t5gemma/modeling_t5gemma.py +6 -6
  700. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  701. transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
  702. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  703. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  704. transformers/models/table_transformer/modeling_table_transformer.py +5 -1
  705. transformers/models/tapas/modeling_tapas.py +3 -0
  706. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  707. transformers/models/textnet/modeling_textnet.py +11 -2
  708. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  709. transformers/models/timesfm/modeling_timesfm.py +14 -0
  710. transformers/models/timesfm/modular_timesfm.py +14 -0
  711. transformers/models/timesformer/modeling_timesformer.py +2 -0
  712. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  713. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  714. transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
  715. transformers/models/trocr/modeling_trocr.py +3 -2
  716. transformers/models/tvp/configuration_tvp.py +5 -1
  717. transformers/models/tvp/modeling_tvp.py +6 -4
  718. transformers/models/udop/configuration_udop.py +1 -0
  719. transformers/models/udop/modeling_udop.py +7 -7
  720. transformers/models/udop/tokenization_udop.py +5 -13
  721. transformers/models/umt5/configuration_umt5.py +2 -2
  722. transformers/models/umt5/modeling_umt5.py +7 -6
  723. transformers/models/unispeech/modeling_unispeech.py +4 -0
  724. transformers/models/unispeech/modular_unispeech.py +2 -0
  725. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  726. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  727. transformers/models/univnet/modeling_univnet.py +1 -0
  728. transformers/models/upernet/modeling_upernet.py +1 -0
  729. transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
  730. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  731. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  732. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  733. transformers/models/video_llava/modeling_video_llava.py +7 -3
  734. transformers/models/vilt/configuration_vilt.py +2 -2
  735. transformers/models/vilt/modeling_vilt.py +13 -0
  736. transformers/models/vipllava/modeling_vipllava.py +7 -3
  737. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  738. transformers/models/visual_bert/modeling_visual_bert.py +8 -0
  739. transformers/models/vitdet/modeling_vitdet.py +2 -0
  740. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  741. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  742. transformers/models/vitmatte/modeling_vitmatte.py +5 -0
  743. transformers/models/vitpose/configuration_vitpose.py +1 -1
  744. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  745. transformers/models/vits/modeling_vits.py +1 -0
  746. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  747. transformers/models/voxtral/modeling_voxtral.py +2 -2
  748. transformers/models/voxtral/modular_voxtral.py +2 -2
  749. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  750. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
  751. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
  752. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
  753. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  754. transformers/models/wavlm/modeling_wavlm.py +5 -0
  755. transformers/models/whisper/generation_whisper.py +1 -0
  756. transformers/models/whisper/modeling_whisper.py +11 -3
  757. transformers/models/whisper/tokenization_whisper.py +4 -15
  758. transformers/models/x_clip/modeling_x_clip.py +5 -0
  759. transformers/models/xcodec/modeling_xcodec.py +5 -0
  760. transformers/models/xglm/modeling_xglm.py +11 -0
  761. transformers/models/xglm/tokenization_xglm.py +4 -9
  762. transformers/models/xlm/modeling_xlm.py +18 -14
  763. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  764. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  765. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  766. transformers/models/xlnet/modeling_xlnet.py +3 -1
  767. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  768. transformers/models/xmod/modeling_xmod.py +3 -0
  769. transformers/models/yoso/modeling_yoso.py +10 -1
  770. transformers/models/zamba/modeling_zamba.py +4 -1
  771. transformers/models/zamba2/modeling_zamba2.py +7 -4
  772. transformers/models/zamba2/modular_zamba2.py +1 -1
  773. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  774. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  775. transformers/models/zoedepth/modeling_zoedepth.py +8 -0
  776. transformers/pipelines/__init__.py +11 -9
  777. transformers/pipelines/automatic_speech_recognition.py +20 -12
  778. transformers/pipelines/base.py +2 -10
  779. transformers/pipelines/document_question_answering.py +4 -2
  780. transformers/pipelines/question_answering.py +1 -1
  781. transformers/pipelines/text_generation.py +1 -1
  782. transformers/pipelines/text_to_audio.py +2 -2
  783. transformers/processing_utils.py +133 -50
  784. transformers/quantizers/auto.py +2 -4
  785. transformers/quantizers/base.py +44 -174
  786. transformers/quantizers/quantizer_aqlm.py +2 -23
  787. transformers/quantizers/quantizer_auto_round.py +2 -12
  788. transformers/quantizers/quantizer_awq.py +20 -89
  789. transformers/quantizers/quantizer_bitnet.py +4 -14
  790. transformers/quantizers/quantizer_bnb_4bit.py +18 -155
  791. transformers/quantizers/quantizer_bnb_8bit.py +24 -110
  792. transformers/quantizers/quantizer_compressed_tensors.py +2 -9
  793. transformers/quantizers/quantizer_eetq.py +16 -74
  794. transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
  795. transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
  796. transformers/quantizers/quantizer_fp_quant.py +52 -82
  797. transformers/quantizers/quantizer_gptq.py +8 -28
  798. transformers/quantizers/quantizer_higgs.py +42 -60
  799. transformers/quantizers/quantizer_hqq.py +144 -153
  800. transformers/quantizers/quantizer_mxfp4.py +14 -194
  801. transformers/quantizers/quantizer_quanto.py +35 -79
  802. transformers/quantizers/quantizer_quark.py +36 -17
  803. transformers/quantizers/quantizer_spqr.py +4 -12
  804. transformers/quantizers/quantizer_torchao.py +50 -325
  805. transformers/quantizers/quantizer_vptq.py +4 -27
  806. transformers/quantizers/quantizers_utils.py +20 -0
  807. transformers/testing_utils.py +324 -47
  808. transformers/tokenization_mistral_common.py +7 -2
  809. transformers/tokenization_utils_base.py +116 -224
  810. transformers/tokenization_utils_tokenizers.py +190 -106
  811. transformers/trainer.py +51 -32
  812. transformers/trainer_callback.py +8 -0
  813. transformers/trainer_jit_checkpoint.py +126 -0
  814. transformers/trainer_seq2seq.py +4 -0
  815. transformers/trainer_utils.py +1 -1
  816. transformers/training_args.py +74 -38
  817. transformers/utils/__init__.py +7 -4
  818. transformers/utils/attention_visualizer.py +4 -4
  819. transformers/utils/auto_docstring.py +35 -25
  820. transformers/utils/generic.py +47 -1
  821. transformers/utils/hub.py +5 -15
  822. transformers/utils/import_utils.py +112 -25
  823. transformers/utils/kernel_config.py +74 -19
  824. transformers/utils/loading_report.py +19 -10
  825. transformers/utils/quantization_config.py +78 -245
  826. transformers/video_processing_utils.py +17 -14
  827. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
  828. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
  829. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
  830. transformers/kernels/__init__.py +0 -0
  831. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  832. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  833. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  834. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
  835. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/bros/modeling_bros.py

@@ -522,6 +522,14 @@ class BrosPreTrainedModel(PreTrainedModel):
         std = self.config.initializer_range
         if isinstance(module, BrosRelationExtractor):
             init.normal_(module.dummy_node, std=std)
+        elif isinstance(module, BrosTextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
+        elif isinstance(module, BrosPositionalEmbedding1D):
+            inv_freq = 1 / (
+                10000 ** (torch.arange(0.0, module.dim_bbox_sinusoid_emb_1d, 2.0) / module.dim_bbox_sinusoid_emb_1d)
+            )
+            init.copy_(module.inv_freq, inv_freq)
 
 
     @auto_docstring
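The new BrosPositionalEmbedding1D branch re-derives the sinusoidal inverse frequencies during weight initialization rather than relying on values set only in the constructor (plausibly so the non-persistent buffer gets correct values even when the module is materialized off a meta device during loading). Below is a minimal standalone sketch of what that expression computes; sinusoid_inv_freq and the toy dim value are illustrative, not part of the library:

import torch

def sinusoid_inv_freq(dim: int) -> torch.Tensor:
    # One frequency per (sin, cos) channel pair: 10000^(-2i/dim),
    # the same expression the diff assigns to module.inv_freq.
    return 1 / (10000 ** (torch.arange(0.0, dim, 2.0) / dim))

inv_freq = sinusoid_inv_freq(8)
positions = torch.arange(4.0).unsqueeze(-1)            # (seq, 1)
angles = positions * inv_freq                          # (seq, dim/2)
emb = torch.cat([angles.sin(), angles.cos()], dim=-1)  # (seq, dim)
print(emb.shape)  # torch.Size([4, 8])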
@@ -563,6 +571,7 @@ class BrosModel(BrosPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
         r"""
         bbox (`torch.FloatTensor` of shape `(batch_size, num_boxes, 4)`):

@@ -701,6 +710,7 @@ class BrosForTokenClassification(BrosPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
         r"""
         bbox (`torch.FloatTensor` of shape `(batch_size, num_boxes, 4)`):

@@ -821,6 +831,7 @@ class BrosSpadeEEForTokenClassification(BrosPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BrosSpadeOutput]:
         r"""
         bbox (`torch.FloatTensor` of shape `(batch_size, num_boxes, 4)`):

@@ -957,6 +968,7 @@ class BrosSpadeELForTokenClassification(BrosPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
         r"""
         bbox (`torch.FloatTensor` of shape `(batch_size, num_boxes, 4)`):
@@ -54,6 +54,112 @@ from .configuration_camembert import CamembertConfig
 logger = logging.get_logger(__name__)
 
 
+class CamembertEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
+
+        self.padding_idx = config.pad_token_id
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
+        if position_ids is None:
+            if input_ids is not None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = self.create_position_ids_from_input_ids(
+                    input_ids, self.padding_idx, past_key_values_length
+                )
+            else:
+                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, self.padding_idx)
+
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        batch_size, seq_length = input_shape
+
+        # Default token_type_ids to the all-zero buffer registered in the constructor. This usually happens
+        # when they are auto-generated; the registered buffer lets users trace the model without passing
+        # token_type_ids and solves issue #5664.
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                # NOTE: We assume position ids have either bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0])
+                buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1)
+                buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids)
+                token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = inputs_embeds + token_type_embeddings
+
+        position_embeddings = self.position_embeddings(position_ids)
+        embeddings = embeddings + position_embeddings
+
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+    @staticmethod
+    def create_position_ids_from_inputs_embeds(inputs_embeds, padding_idx):
+        """
+        We are provided embeddings directly. We cannot infer which are padded, so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(
+            padding_idx + 1, sequence_length + padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape)
+
+    @staticmethod
+    def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+        """
+        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1; padding
+        symbols are ignored. This is modified from fairseq's `utils.make_positions`.
+
+        Args:
+            input_ids: torch.Tensor
+
+        Returns: torch.Tensor
+        """
+        # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+        mask = input_ids.ne(padding_idx).int()
+        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+        return incremental_indices.long() + padding_idx
+
+
 def eager_attention_forward(
     module: nn.Module,
     query: torch.Tensor,
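`create_position_ids_from_input_ids` keeps padded slots at `padding_idx` and numbers real tokens from `padding_idx + 1`. A worked example, assuming `padding_idx=1` and `past_key_values_length=0`:

import torch

input_ids = torch.tensor([[5, 6, 7, 1, 1]])  # hypothetical ids; 1 is the pad token
mask = input_ids.ne(1).int()                 # -> [[1, 1, 1, 0, 0]]
incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
print(incremental_indices.long() + 1)        # -> [[2, 3, 4, 1, 1]]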
@@ -417,112 +523,9 @@ class CamembertPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, CamembertLMHead):
             init.zeros_(module.bias)
-
-
-class CamembertEmbeddings(nn.Module):
-    """Construct the embeddings from word, position and token_type embeddings."""
-
-    def __init__(self, config):
-        super().__init__()
-        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
-        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
-
-        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer(
-            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
-        )
-        self.register_buffer(
-            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
-        )
-
-        self.padding_idx = config.pad_token_id
-        self.position_embeddings = nn.Embedding(
-            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
-        )
-
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        past_key_values_length: int = 0,
-    ) -> torch.Tensor:
-        if position_ids is None:
-            if input_ids is not None:
-                # Create the position ids from the input token ids. Any padded tokens remain padded.
-                position_ids = self.create_position_ids_from_input_ids(
-                    input_ids, self.padding_idx, past_key_values_length
-                )
-            else:
-                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, self.padding_idx)
-
-        if input_ids is not None:
-            input_shape = input_ids.size()
-        else:
-            input_shape = inputs_embeds.size()[:-1]
-
-        batch_size, seq_length = input_shape
-
-        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
-        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
-        # issue #5664
-        if token_type_ids is None:
-            if hasattr(self, "token_type_ids"):
-                # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0])
-                buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1)
-                buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids)
-                token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
-            else:
-                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
-
-        if inputs_embeds is None:
-            inputs_embeds = self.word_embeddings(input_ids)
-        token_type_embeddings = self.token_type_embeddings(token_type_ids)
-        embeddings = inputs_embeds + token_type_embeddings
-
-        position_embeddings = self.position_embeddings(position_ids)
-        embeddings = embeddings + position_embeddings
-
-        embeddings = self.LayerNorm(embeddings)
-        embeddings = self.dropout(embeddings)
-        return embeddings
-
-    @staticmethod
-    def create_position_ids_from_inputs_embeds(inputs_embeds, padding_idx):
-        """
-        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
-
-        Args:
-            inputs_embeds: torch.Tensor
-
-        Returns: torch.Tensor
-        """
-        input_shape = inputs_embeds.size()[:-1]
-        sequence_length = input_shape[1]
-
-        position_ids = torch.arange(
-            padding_idx + 1, sequence_length + padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
-        )
-        return position_ids.unsqueeze(0).expand(input_shape)
-
-    @staticmethod
-    def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
-        """
-        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
-        are ignored. This is modified from fairseq's `utils.make_positions`.
-
-        Args:
-            x: torch.Tensor x:
-
-        Returns: torch.Tensor
-        """
-        # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
-        mask = input_ids.ne(padding_idx).int()
-        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
-        return incremental_indices.long() + padding_idx
+        elif isinstance(module, CamembertEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
 
 class CamembertEncoder(nn.Module):
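The `elif` branch above, like the matching branches added for Bros, Canine, ChineseCLIP, Clap, and CLIP elsewhere in this diff, re-fills non-persistent buffers inside `_init_weights`. A small, self-contained demonstration of why that is necessary (names are illustrative only): non-persistent buffers are excluded from the state dict, so loading a checkpoint cannot restore them.

import torch
from torch import nn

class Embeddings(nn.Module):
    def __init__(self, max_positions=8):
        super().__init__()
        self.register_buffer(
            "position_ids", torch.arange(max_positions).expand((1, -1)), persistent=False
        )

m = Embeddings()
print("position_ids" in m.state_dict())  # False -- the buffer must be rebuilt after loading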
@@ -14,6 +14,8 @@
 # limitations under the License
 """Tokenization classes for Camembert model."""
 
+from typing import Optional, Union
+
 from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
 from tokenizers.models import Unigram
 
@@ -83,7 +85,7 @@ class CamembertTokenizer(TokenizersBackend):
         vocab_file (`str`, *optional*):
             [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        vocab (`dict`, *optional*):
+        vocab (`str`, `dict` or `list`, *optional*):
            Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
     """
 
@@ -103,7 +105,7 @@ class CamembertTokenizer(TokenizersBackend):
         additional_special_tokens=None,
         add_prefix_space=True,
         vocab_file=None,
-        vocab=None,
+        vocab: Optional[Union[str, dict, list]] = None,
         **kwargs,
     ):
         self.vocab_file = vocab_file
@@ -114,9 +116,9 @@ class CamembertTokenizer(TokenizersBackend):
         if additional_special_tokens is None:
             additional_special_tokens = ["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"]
 
-        if vocab is not None and isinstance(vocab, list):
-            self._vocab = list(vocab)
-            unk_index = next(i for i, (tok, _) in enumerate(self._vocab) if tok == str(unk_token))
+        if vocab is not None:
+            self._vocab = vocab
+            unk_index = next((i for i, (tok, _) in enumerate(self._vocab) if tok == str(unk_token)), 0)
             self._tokenizer = Tokenizer(Unigram(self._vocab, unk_id=unk_index, byte_fallback=False))
         else:
             self._vocab = [
@@ -131,11 +133,8 @@ class CamembertTokenizer(TokenizersBackend):
 
         self._tokenizer.normalizer = normalizers.Sequence(
             [
-                normalizers.Replace("\n", " "),
-                normalizers.Replace("\r", " "),
-                normalizers.Replace("\t", " "),
+                normalizers.Replace(Regex(r"\s{2,}|[\n\r\t]"), " "),
                 normalizers.Strip(left=False, right=True),
-                normalizers.Replace(Regex(" {2,}"), "▁"),
             ]
         )
 
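A quick check of the consolidated normalizer, using the `tokenizers` API directly: any newline, carriage return, tab, or run of whitespace collapses to a single space, then trailing whitespace is stripped.

from tokenizers import Regex, normalizers

norm = normalizers.Sequence(
    [
        normalizers.Replace(Regex(r"\s{2,}|[\n\r\t]"), " "),
        normalizers.Strip(left=False, right=True),
    ]
)
print(repr(norm.normalize_str("un  exemple\tavec\ndes espaces  ")))
# 'un exemple avec des espaces'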
@@ -143,10 +142,7 @@ class CamembertTokenizer(TokenizersBackend):
         self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)
         self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)
 
-        tokenizer_object = self._tokenizer
-
         super().__init__(
-            tokenizer_object=tokenizer_object,
             bos_token=bos_token,
             eos_token=eos_token,
             sep_token=sep_token,
@@ -23,6 +23,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import (
@@ -719,6 +720,11 @@ class CaninePreTrainedModel(PreTrainedModel):
     base_model_prefix = "canine"
     supports_gradient_checkpointing = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, CanineEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+
 
 @auto_docstring
 class CanineModel(CaninePreTrainedModel):
@@ -836,6 +842,7 @@ class CanineModel(CaninePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, CanineModelOutputWithPooling]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1006,6 +1013,7 @@ class CanineForSequenceClassification(CaninePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1089,6 +1097,7 @@ class CanineForMultipleChoice(CaninePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MultipleChoiceModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -1192,6 +1201,7 @@ class CanineForTokenClassification(CaninePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1287,6 +1297,7 @@ class CanineForQuestionAnswering(CaninePreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -67,6 +67,8 @@ class CanineTokenizer(PreTrainedTokenizer):
             The maximum sentence length the model accepts.
     """
 
+    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
+
     def __init__(
         self,
         bos_token=chr(CLS),
@@ -38,6 +38,7 @@ from ...utils import (
     can_return_tuple,
     logging,
 )
+from ...utils.generic import maybe_autocast
 from .configuration_chameleon import ChameleonConfig, ChameleonVQVAEConfig
 
 
@@ -83,7 +84,7 @@ class ChameleonRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -122,7 +123,7 @@ class ChameleonRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
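`maybe_autocast` is imported from `transformers.utils.generic`; its implementation is not part of this diff. Conceptually it behaves like the hedged sketch below, degrading to a no-op context on device types where `torch.autocast` is unsupported.

import contextlib
import torch

def maybe_autocast_sketch(device_type: str, enabled: bool = True, **kwargs):
    # Illustrative stand-in only -- the real helper may differ.
    try:
        return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
    except RuntimeError:  # unsupported autocast device type
        return contextlib.nullcontext()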
@@ -808,6 +809,7 @@ class ChameleonVQVAE(ChameleonPreTrainedModel):
         self.quant_conv = torch.nn.Conv2d(config.latent_channels, config.embed_dim, 1)
         self.post_quant_conv = torch.nn.Conv2d(config.embed_dim, config.latent_channels, 1)
         self.eval()  # Chameleon's VQ model is frozen
+        self.post_init()
 
     def encode(self, pixel_values: torch.LongTensor):
         hidden_states = self.encoder(pixel_values)
@@ -1121,6 +1123,7 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixin):
         cache_position=None,
         position_ids=None,
         use_cache=True,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1134,12 +1137,15 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixin):
             cache_position=cache_position,
             position_ids=position_ids,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if cache_position[0] != 0:
-            # If we're in cached decoding stage, pixel values should be `None` because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if not is_first_iteration and use_cache:
+            # Pixel values are only used on the first iteration, if available.
+            # In subsequent iterations they are already merged with the text and cached.
+            # NOTE: the first iteration doesn't have to be prefill; it can be the first
+            # iteration with a new question on top of a cached system prompt (continuing generation from cache).
             model_inputs["pixel_values"] = None
 
         return model_inputs
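An illustrative consequence of the new condition (call shapes are hypothetical): `pixel_values` survive only the first iteration; on later steps the merged image features already sit in the KV cache, so the pixels are dropped.

# First call: images are encoded and merged into the cache.
inputs = model.prepare_inputs_for_generation(
    input_ids, pixel_values=pixel_values, use_cache=True, is_first_iteration=True
)
assert inputs["pixel_values"] is not None

# Later calls: the cached features are reused, so the pixels are dropped.
inputs = model.prepare_inputs_for_generation(
    input_ids, pixel_values=pixel_values, use_cache=True, is_first_iteration=False
)
assert inputs["pixel_values"] is None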
@@ -572,10 +572,13 @@ class ChineseCLIPPreTrainedModel(PreTrainedModel):
             init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
             init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
             init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+            init.copy_(module.position_ids, torch.arange(module.num_positions).expand((1, -1)))
         elif isinstance(module, ChineseCLIPTextEmbeddings):
             init.normal_(module.word_embeddings.weight, mean=0.0, std=self.config.initializer_range)
             init.normal_(module.position_embeddings.weight, mean=0.0, std=self.config.initializer_range)
             init.normal_(module.token_type_embeddings.weight, mean=0.0, std=self.config.initializer_range)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
             for embedding in [module.word_embeddings, module.position_embeddings, module.token_type_embeddings]:
                 if embedding.padding_idx is not None:
                     init.zeros_(embedding.weight[embedding.padding_idx])
@@ -638,9 +641,9 @@ class ChineseCLIPTextEncoder(nn.Module):
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
             layer_outputs = layer_module(
-                hidden_states=hidden_states,
-                attention_mask=attention_mask,
-                output_attentions=output_attentions,
+                hidden_states,
+                attention_mask,
+                output_attentions,
                 **kwargs,
             )
 
@@ -839,6 +842,7 @@ class ChineseCLIPTextModel(ChineseCLIPPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPooling]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -926,6 +930,7 @@ class ChineseCLIPVisionModel(ChineseCLIPPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         Examples:
@@ -1091,6 +1096,7 @@ class ChineseCLIPModel(ChineseCLIPPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         interpolate_pos_encoding: bool = False,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ChineseCLIPOutput]:
         r"""
         return_loss (`bool`, *optional*):
@@ -71,7 +71,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor):
             Truncation pattern for long audio inputs. Two patterns are available:
                 - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a
                   downsampled version of the entire mel spectrogram.
-                  If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a copy
+                  If `config.fusion` is set to True, shorter audios also need to return 4 mels, which will just be a copy
                   of the original mel obtained from the padded audio.
                 - `rand_trunc` will select a random crop of the mel spectrogram.
         padding (`str`, *optional*, defaults to `"repeatpad"`):
@@ -279,7 +279,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor):
             Truncation pattern for long audio inputs. Two patterns are available:
                 - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and
                   a downsampled version of the entire mel spectrogram.
-                  If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a
+                  If `config.fusion` is set to True, shorter audios also need to return 4 mels, which will just be a
                   copy of the original mel obtained from the padded audio.
                 - `rand_trunc` will select a random crop of the mel spectrogram.
             padding (`str`, *optional*):
@@ -365,18 +365,7 @@ class ClapAudioSelfAttention(nn.Module):
             torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
         )
 
-        # get pair-wise relative position index for each token inside the window
-        coords_h = torch.arange(self.window_size[0])
-        coords_w = torch.arange(self.window_size[1])
-        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
-        coords_flatten = torch.flatten(coords, 1)
-        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
-        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
-        relative_coords[:, :, 0] += self.window_size[0] - 1
-        relative_coords[:, :, 1] += self.window_size[1] - 1
-        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
-        relative_position_index = relative_coords.sum(-1)
-        self.register_buffer("relative_position_index", relative_position_index)
+        self.register_buffer("relative_position_index", self.create_relative_position_index())
 
         self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
         self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
@@ -435,6 +424,20 @@ class ClapAudioSelfAttention(nn.Module):
 
         return outputs
 
+    def create_relative_position_index(self):
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
+        coords_flatten = torch.flatten(coords, 1)
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
+        relative_coords[:, :, 0] += self.window_size[0] - 1
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)
+        return relative_position_index
+
 
 # Copied from transformers.models.swin.modeling_swin.SwinSelfOutput with Swin->ClapAudio
 class ClapAudioSelfOutput(nn.Module):
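A worked run of the factored-out computation for a hypothetical 2x2 window: 4 tokens, per-axis relative offsets in [-1, 1], flattened row-major into a single index in [0, 8] that addresses the relative position bias table.

import torch

window = (2, 2)
coords = torch.stack(torch.meshgrid(torch.arange(window[0]), torch.arange(window[1]), indexing="ij"))
flat = torch.flatten(coords, 1)            # (2, 4): (row, col) of each token
rel = (flat[:, :, None] - flat[:, None, :]).permute(1, 2, 0).contiguous()
rel[:, :, 0] += window[0] - 1              # shift row offsets to start at 0
rel[:, :, 1] += window[1] - 1              # shift col offsets to start at 0
rel[:, :, 0] *= 2 * window[1] - 1          # row-major flattening
print(rel.sum(-1))                         # (4, 4) indices into the bias table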
@@ -1266,9 +1269,9 @@ class ClapTextEncoder(nn.Module):
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
             layer_outputs = layer_module(
-                hidden_states=hidden_states,
-                attention_mask=attention_mask,
-                output_attentions=output_attentions,
+                hidden_states,
+                attention_mask,
+                output_attentions,
                 **kwargs,
             )
 
@@ -1317,6 +1320,8 @@ class ClapPreTrainedModel(PreTrainedModel):
         if isinstance(module, ClapTextEmbeddings):
             init.normal_(module.position_embeddings.weight, mean=0.0, std=factor * 0.02)
             init.normal_(module.token_type_embeddings.weight, mean=0.0, std=factor * 0.02)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
         elif isinstance(module, ClapModel):
             init.constant_(module.logit_scale_a, math.log(self.config.logit_scale_init_value))
             init.constant_(module.logit_scale_t, math.log(self.config.logit_scale_init_value))
@@ -1325,6 +1330,10 @@ class ClapPreTrainedModel(PreTrainedModel):
         elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, (nn.Conv2d, nn.Linear)):
             in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor
             init.normal_(module.weight, std=in_proj_std)
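The `running_mean` guard above exists because this branch handles both `nn.LayerNorm` and `nn.BatchNorm2d`, and only the latter carries running statistics. A quick check:

from torch import nn

bn, ln = nn.BatchNorm2d(4), nn.LayerNorm(4)
print(bn.running_mean is not None)        # True: BatchNorm tracks running stats as buffers
print(getattr(ln, "running_mean", None))  # None: the guard skips LayerNorm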
@@ -1332,6 +1341,7 @@ class ClapPreTrainedModel(PreTrainedModel):
             init.zeros_(module.bias)
         elif isinstance(module, ClapAudioSelfAttention):
             init.zeros_(module.relative_position_bias_table)
+            init.copy_(module.relative_position_index, module.create_relative_position_index())
 
 
 class ClapAudioModel(ClapPreTrainedModel):
@@ -1356,6 +1366,7 @@ class ClapAudioModel(ClapPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
@@ -1446,6 +1457,7 @@ class ClapTextModel(ClapPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1627,6 +1639,7 @@ class ClapModel(ClapPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ClapOutput]:
         r"""
         is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
@@ -1740,6 +1753,7 @@ class ClapTextModelWithProjection(ClapPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ClapTextModelOutput]:
         r"""
         Examples:
@@ -1803,6 +1817,7 @@ class ClapAudioModelWithProjection(ClapPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, ClapAudioModelOutput]:
         r"""
         is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
@@ -416,11 +416,13 @@ class CLIPPreTrainedModel(PreTrainedModel):
         if isinstance(module, CLIPTextEmbeddings):
             init.normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02)
             init.normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
         elif isinstance(module, CLIPVisionEmbeddings):
             factor = self.config.initializer_factor
             init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
             init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
             init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+            init.copy_(module.position_ids, torch.arange(module.num_positions).expand((1, -1)))
         elif isinstance(module, CLIPAttention):
             factor = self.config.initializer_factor
             in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor