transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (835)
  1. transformers/__init__.py +49 -3
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/cli/serve.py +47 -17
  6. transformers/configuration_utils.py +114 -70
  7. transformers/conversion_mapping.py +83 -7
  8. transformers/convert_slow_tokenizer.py +225 -10
  9. transformers/core_model_loading.py +374 -147
  10. transformers/data/data_collator.py +12 -4
  11. transformers/dependency_versions_table.py +2 -3
  12. transformers/dynamic_module_utils.py +1 -2
  13. transformers/feature_extraction_utils.py +55 -24
  14. transformers/file_utils.py +0 -1
  15. transformers/generation/__init__.py +11 -1
  16. transformers/generation/candidate_generator.py +79 -31
  17. transformers/generation/configuration_utils.py +165 -124
  18. transformers/generation/continuous_batching/__init__.py +4 -0
  19. transformers/generation/continuous_batching/cache.py +47 -18
  20. transformers/generation/continuous_batching/cache_manager.py +131 -34
  21. transformers/generation/continuous_batching/continuous_api.py +228 -136
  22. transformers/generation/continuous_batching/requests.py +28 -1
  23. transformers/generation/continuous_batching/scheduler.py +11 -4
  24. transformers/generation/stopping_criteria.py +1 -1
  25. transformers/generation/utils.py +108 -110
  26. transformers/generation/watermarking.py +8 -5
  27. transformers/image_processing_base.py +3 -14
  28. transformers/image_processing_utils_fast.py +15 -4
  29. transformers/initialization.py +37 -0
  30. transformers/integrations/__init__.py +16 -2
  31. transformers/integrations/accelerate.py +58 -113
  32. transformers/integrations/aqlm.py +36 -66
  33. transformers/integrations/awq.py +46 -515
  34. transformers/integrations/bitnet.py +47 -105
  35. transformers/integrations/bitsandbytes.py +91 -202
  36. transformers/integrations/deepspeed.py +18 -2
  37. transformers/integrations/eetq.py +84 -81
  38. transformers/integrations/fbgemm_fp8.py +191 -145
  39. transformers/integrations/finegrained_fp8.py +241 -208
  40. transformers/integrations/flash_attention.py +2 -2
  41. transformers/integrations/fp_quant.py +92 -0
  42. transformers/integrations/ggml.py +11 -1
  43. transformers/integrations/higgs.py +37 -62
  44. transformers/integrations/hub_kernels.py +65 -8
  45. transformers/integrations/integration_utils.py +45 -0
  46. transformers/integrations/mistral.py +12 -0
  47. transformers/integrations/moe.py +240 -0
  48. transformers/integrations/mxfp4.py +28 -74
  49. transformers/integrations/peft.py +12 -29
  50. transformers/integrations/quanto.py +77 -56
  51. transformers/integrations/quark.py +55 -0
  52. transformers/integrations/spqr.py +42 -90
  53. transformers/integrations/tensor_parallel.py +167 -221
  54. transformers/integrations/torchao.py +32 -38
  55. transformers/integrations/vptq.py +40 -59
  56. transformers/modelcard.py +1 -2
  57. transformers/modeling_gguf_pytorch_utils.py +74 -19
  58. transformers/modeling_rope_utils.py +107 -86
  59. transformers/modeling_utils.py +611 -527
  60. transformers/models/__init__.py +22 -0
  61. transformers/models/afmoe/modeling_afmoe.py +10 -19
  62. transformers/models/afmoe/modular_afmoe.py +5 -13
  63. transformers/models/aimv2/modeling_aimv2.py +4 -0
  64. transformers/models/aimv2/modular_aimv2.py +4 -0
  65. transformers/models/albert/modeling_albert.py +3 -0
  66. transformers/models/albert/tokenization_albert.py +6 -12
  67. transformers/models/align/modeling_align.py +14 -6
  68. transformers/models/altclip/modeling_altclip.py +11 -3
  69. transformers/models/apertus/modeling_apertus.py +8 -6
  70. transformers/models/apertus/modular_apertus.py +4 -1
  71. transformers/models/arcee/modeling_arcee.py +5 -5
  72. transformers/models/aria/modeling_aria.py +12 -8
  73. transformers/models/aria/modular_aria.py +7 -3
  74. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  75. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  76. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  77. transformers/models/auto/auto_factory.py +1 -1
  78. transformers/models/auto/configuration_auto.py +38 -0
  79. transformers/models/auto/feature_extraction_auto.py +9 -3
  80. transformers/models/auto/image_processing_auto.py +5 -2
  81. transformers/models/auto/modeling_auto.py +37 -0
  82. transformers/models/auto/processing_auto.py +22 -10
  83. transformers/models/auto/tokenization_auto.py +147 -566
  84. transformers/models/auto/video_processing_auto.py +5 -2
  85. transformers/models/autoformer/modeling_autoformer.py +4 -0
  86. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  87. transformers/models/bamba/modeling_bamba.py +21 -21
  88. transformers/models/bamba/modular_bamba.py +17 -16
  89. transformers/models/bark/modeling_bark.py +11 -0
  90. transformers/models/bart/configuration_bart.py +0 -1
  91. transformers/models/bart/modeling_bart.py +14 -0
  92. transformers/models/barthez/tokenization_barthez.py +5 -10
  93. transformers/models/beit/image_processing_beit_fast.py +0 -1
  94. transformers/models/beit/modeling_beit.py +6 -1
  95. transformers/models/bert/modeling_bert.py +3 -0
  96. transformers/models/bert/tokenization_bert.py +8 -21
  97. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  98. transformers/models/big_bird/modeling_big_bird.py +9 -0
  99. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  100. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
  101. transformers/models/biogpt/modeling_biogpt.py +2 -0
  102. transformers/models/biogpt/modular_biogpt.py +2 -0
  103. transformers/models/bit/modeling_bit.py +16 -3
  104. transformers/models/bitnet/modeling_bitnet.py +5 -5
  105. transformers/models/blenderbot/modeling_blenderbot.py +12 -0
  106. transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
  107. transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
  108. transformers/models/blip/modeling_blip.py +2 -0
  109. transformers/models/blip/modeling_blip_text.py +10 -0
  110. transformers/models/blip_2/modeling_blip_2.py +4 -1
  111. transformers/models/bloom/modeling_bloom.py +17 -44
  112. transformers/models/blt/modeling_blt.py +164 -4
  113. transformers/models/blt/modular_blt.py +170 -5
  114. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  115. transformers/models/bridgetower/modeling_bridgetower.py +11 -1
  116. transformers/models/bros/modeling_bros.py +12 -0
  117. transformers/models/camembert/modeling_camembert.py +109 -106
  118. transformers/models/camembert/tokenization_camembert.py +8 -12
  119. transformers/models/canine/modeling_canine.py +11 -0
  120. transformers/models/canine/tokenization_canine.py +2 -0
  121. transformers/models/chameleon/modeling_chameleon.py +11 -5
  122. transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
  123. transformers/models/clap/feature_extraction_clap.py +2 -2
  124. transformers/models/clap/modeling_clap.py +30 -15
  125. transformers/models/clip/modeling_clip.py +2 -0
  126. transformers/models/clip/tokenization_clip.py +22 -44
  127. transformers/models/clipseg/modeling_clipseg.py +9 -0
  128. transformers/models/clvp/modeling_clvp.py +19 -3
  129. transformers/models/clvp/tokenization_clvp.py +1 -63
  130. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  131. transformers/models/codegen/modeling_codegen.py +13 -4
  132. transformers/models/codegen/tokenization_codegen.py +14 -43
  133. transformers/models/cohere/modeling_cohere.py +5 -4
  134. transformers/models/cohere/modular_cohere.py +2 -1
  135. transformers/models/cohere/tokenization_cohere.py +12 -42
  136. transformers/models/cohere2/modeling_cohere2.py +8 -7
  137. transformers/models/cohere2/modular_cohere2.py +5 -5
  138. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
  139. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  140. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  141. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  142. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  143. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  144. transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
  145. transformers/models/convbert/modeling_convbert.py +9 -0
  146. transformers/models/convnext/image_processing_convnext.py +2 -2
  147. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  148. transformers/models/convnext/modeling_convnext.py +2 -4
  149. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  150. transformers/models/csm/generation_csm.py +19 -22
  151. transformers/models/csm/modeling_csm.py +7 -4
  152. transformers/models/csm/modular_csm.py +2 -0
  153. transformers/models/ctrl/modeling_ctrl.py +15 -2
  154. transformers/models/cvt/modeling_cvt.py +7 -1
  155. transformers/models/cwm/modeling_cwm.py +5 -5
  156. transformers/models/d_fine/configuration_d_fine.py +3 -4
  157. transformers/models/d_fine/modeling_d_fine.py +48 -39
  158. transformers/models/d_fine/modular_d_fine.py +16 -4
  159. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  160. transformers/models/dab_detr/modeling_dab_detr.py +5 -1
  161. transformers/models/dac/modeling_dac.py +6 -6
  162. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  163. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  164. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  165. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  166. transformers/models/dbrx/configuration_dbrx.py +9 -1
  167. transformers/models/dbrx/modeling_dbrx.py +3 -3
  168. transformers/models/deberta/modeling_deberta.py +7 -0
  169. transformers/models/deberta/tokenization_deberta.py +11 -20
  170. transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
  171. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  172. transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
  173. transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
  174. transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
  175. transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
  176. transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
  177. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  178. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  179. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  180. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  181. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  182. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  183. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  184. transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
  185. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  186. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  187. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  188. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  189. transformers/models/detr/configuration_detr.py +1 -1
  190. transformers/models/detr/modeling_detr.py +13 -1
  191. transformers/models/dia/generation_dia.py +3 -10
  192. transformers/models/dia/modeling_dia.py +16 -4
  193. transformers/models/dia/modular_dia.py +11 -1
  194. transformers/models/dia/processing_dia.py +1 -1
  195. transformers/models/diffllama/modeling_diffllama.py +5 -5
  196. transformers/models/diffllama/modular_diffllama.py +2 -2
  197. transformers/models/dinat/modeling_dinat.py +3 -0
  198. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  199. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  200. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
  201. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
  202. transformers/models/distilbert/modeling_distilbert.py +11 -9
  203. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  204. transformers/models/doge/modeling_doge.py +3 -4
  205. transformers/models/doge/modular_doge.py +0 -1
  206. transformers/models/donut/image_processing_donut_fast.py +0 -1
  207. transformers/models/donut/modeling_donut_swin.py +18 -12
  208. transformers/models/dots1/modeling_dots1.py +23 -11
  209. transformers/models/dots1/modular_dots1.py +5 -3
  210. transformers/models/dpr/modeling_dpr.py +5 -0
  211. transformers/models/dpr/tokenization_dpr.py +12 -0
  212. transformers/models/dpt/configuration_dpt.py +1 -1
  213. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  214. transformers/models/dpt/modular_dpt.py +1 -2
  215. transformers/models/edgetam/configuration_edgetam.py +1 -1
  216. transformers/models/edgetam/modeling_edgetam.py +6 -3
  217. transformers/models/edgetam/modular_edgetam.py +15 -14
  218. transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
  219. transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
  220. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  221. transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
  222. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  223. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  224. transformers/models/efficientnet/modeling_efficientnet.py +7 -1
  225. transformers/models/electra/modeling_electra.py +7 -0
  226. transformers/models/emu3/modeling_emu3.py +12 -6
  227. transformers/models/emu3/modular_emu3.py +7 -1
  228. transformers/models/encodec/modeling_encodec.py +14 -0
  229. transformers/models/eomt/image_processing_eomt.py +13 -1
  230. transformers/models/eomt/image_processing_eomt_fast.py +60 -16
  231. transformers/models/eomt/modeling_eomt.py +7 -0
  232. transformers/models/eomt/modular_eomt.py +7 -0
  233. transformers/models/ernie/modeling_ernie.py +6 -0
  234. transformers/models/ernie/modular_ernie.py +6 -0
  235. transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
  236. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  237. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
  238. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
  239. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  240. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  241. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  242. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  243. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  244. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  245. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  246. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  247. transformers/models/esm/modeling_esm.py +6 -0
  248. transformers/models/esm/modeling_esmfold.py +11 -5
  249. transformers/models/evolla/modeling_evolla.py +13 -5
  250. transformers/models/evolla/modular_evolla.py +8 -0
  251. transformers/models/exaone4/modeling_exaone4.py +3 -3
  252. transformers/models/exaone4/modular_exaone4.py +0 -1
  253. transformers/models/falcon/modeling_falcon.py +9 -4
  254. transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
  255. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  256. transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
  257. transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
  258. transformers/models/fast_vlm/__init__.py +27 -0
  259. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  260. transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
  261. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  262. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
  263. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  264. transformers/models/flaubert/modeling_flaubert.py +21 -15
  265. transformers/models/flava/image_processing_flava_fast.py +0 -2
  266. transformers/models/flava/modeling_flava.py +10 -2
  267. transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
  268. transformers/models/florence2/modeling_florence2.py +22 -4
  269. transformers/models/florence2/modular_florence2.py +15 -1
  270. transformers/models/fnet/modeling_fnet.py +14 -0
  271. transformers/models/focalnet/modeling_focalnet.py +4 -0
  272. transformers/models/fsmt/modeling_fsmt.py +2 -0
  273. transformers/models/funnel/modeling_funnel.py +8 -0
  274. transformers/models/funnel/tokenization_funnel.py +17 -24
  275. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  276. transformers/models/fuyu/modeling_fuyu.py +3 -1
  277. transformers/models/fuyu/processing_fuyu.py +19 -3
  278. transformers/models/gemma/modeling_gemma.py +14 -16
  279. transformers/models/gemma/modular_gemma.py +9 -11
  280. transformers/models/gemma/tokenization_gemma.py +10 -27
  281. transformers/models/gemma2/modeling_gemma2.py +5 -5
  282. transformers/models/gemma2/modular_gemma2.py +3 -2
  283. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  284. transformers/models/gemma3/modeling_gemma3.py +42 -91
  285. transformers/models/gemma3/modular_gemma3.py +38 -87
  286. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  287. transformers/models/gemma3n/modeling_gemma3n.py +65 -218
  288. transformers/models/gemma3n/modular_gemma3n.py +68 -68
  289. transformers/models/git/modeling_git.py +183 -126
  290. transformers/models/glm/modeling_glm.py +5 -5
  291. transformers/models/glm4/modeling_glm4.py +5 -5
  292. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  293. transformers/models/glm46v/modeling_glm46v.py +3 -1
  294. transformers/models/glm46v/modular_glm46v.py +3 -0
  295. transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
  296. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  297. transformers/models/glm4v/configuration_glm4v.py +3 -1
  298. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  299. transformers/models/glm4v/modeling_glm4v.py +18 -8
  300. transformers/models/glm4v/modular_glm4v.py +17 -7
  301. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  302. transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
  303. transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
  304. transformers/models/glmasr/__init__.py +30 -0
  305. transformers/models/glmasr/configuration_glmasr.py +197 -0
  306. transformers/models/glmasr/modeling_glmasr.py +512 -0
  307. transformers/models/glmasr/modular_glmasr.py +433 -0
  308. transformers/models/glmasr/processing_glmasr.py +332 -0
  309. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  310. transformers/models/glpn/modeling_glpn.py +2 -0
  311. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  312. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  313. transformers/models/gpt2/modeling_gpt2.py +13 -6
  314. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  315. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
  316. transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
  317. transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
  318. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  319. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  320. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
  321. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  322. transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
  323. transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
  324. transformers/models/gptj/modeling_gptj.py +18 -6
  325. transformers/models/granite/modeling_granite.py +5 -5
  326. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  327. transformers/models/granitemoe/modeling_granitemoe.py +6 -9
  328. transformers/models/granitemoe/modular_granitemoe.py +1 -4
  329. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  330. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
  331. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  332. transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
  333. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  334. transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
  335. transformers/models/groupvit/modeling_groupvit.py +9 -1
  336. transformers/models/helium/modeling_helium.py +5 -4
  337. transformers/models/herbert/tokenization_herbert.py +9 -25
  338. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
  339. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
  340. transformers/models/hiera/modeling_hiera.py +4 -0
  341. transformers/models/hubert/modeling_hubert.py +7 -0
  342. transformers/models/hubert/modular_hubert.py +5 -0
  343. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
  344. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  345. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  346. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
  347. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  348. transformers/models/ibert/modeling_ibert.py +22 -0
  349. transformers/models/idefics/modeling_idefics.py +15 -21
  350. transformers/models/idefics2/modeling_idefics2.py +7 -1
  351. transformers/models/idefics3/modeling_idefics3.py +5 -1
  352. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  353. transformers/models/imagegpt/modeling_imagegpt.py +11 -3
  354. transformers/models/informer/modeling_informer.py +4 -0
  355. transformers/models/informer/modular_informer.py +1 -0
  356. transformers/models/instructblip/modeling_instructblip.py +2 -0
  357. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  358. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  359. transformers/models/internvl/modeling_internvl.py +13 -12
  360. transformers/models/internvl/modular_internvl.py +7 -13
  361. transformers/models/internvl/video_processing_internvl.py +0 -1
  362. transformers/models/jais2/__init__.py +27 -0
  363. transformers/models/jais2/configuration_jais2.py +152 -0
  364. transformers/models/jais2/modeling_jais2.py +486 -0
  365. transformers/models/jais2/modular_jais2.py +196 -0
  366. transformers/models/jamba/modeling_jamba.py +25 -20
  367. transformers/models/jamba/modular_jamba.py +17 -17
  368. transformers/models/janus/image_processing_janus_fast.py +0 -1
  369. transformers/models/janus/modeling_janus.py +16 -7
  370. transformers/models/janus/modular_janus.py +17 -7
  371. transformers/models/jetmoe/modeling_jetmoe.py +4 -4
  372. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  373. transformers/models/kosmos2/modeling_kosmos2.py +15 -2
  374. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  375. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  376. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
  377. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  378. transformers/models/lasr/__init__.py +29 -0
  379. transformers/models/lasr/configuration_lasr.py +248 -0
  380. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  381. transformers/models/lasr/modeling_lasr.py +730 -0
  382. transformers/models/lasr/modular_lasr.py +576 -0
  383. transformers/models/lasr/processing_lasr.py +94 -0
  384. transformers/models/lasr/tokenization_lasr.py +186 -0
  385. transformers/models/layoutlm/modeling_layoutlm.py +10 -3
  386. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  387. transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
  388. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
  389. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  390. transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
  391. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  392. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  393. transformers/models/led/modeling_led.py +12 -0
  394. transformers/models/levit/modeling_levit.py +21 -0
  395. transformers/models/lfm2/modeling_lfm2.py +5 -6
  396. transformers/models/lfm2/modular_lfm2.py +0 -1
  397. transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
  398. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  399. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  400. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  401. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  402. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  403. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  404. transformers/models/lightglue/modeling_lightglue.py +3 -1
  405. transformers/models/lightglue/modular_lightglue.py +1 -0
  406. transformers/models/lilt/modeling_lilt.py +23 -15
  407. transformers/models/llama/modeling_llama.py +5 -5
  408. transformers/models/llama/tokenization_llama.py +15 -43
  409. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  410. transformers/models/llama4/modeling_llama4.py +11 -6
  411. transformers/models/llava/image_processing_llava_fast.py +0 -1
  412. transformers/models/llava/modeling_llava.py +12 -7
  413. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  414. transformers/models/llava_next/modeling_llava_next.py +7 -3
  415. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  416. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  417. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  418. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  419. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  420. transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
  421. transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
  422. transformers/models/longformer/modeling_longformer.py +6 -0
  423. transformers/models/longt5/modeling_longt5.py +4 -4
  424. transformers/models/luke/modeling_luke.py +9 -0
  425. transformers/models/luke/tokenization_luke.py +11 -38
  426. transformers/models/lxmert/modeling_lxmert.py +2 -0
  427. transformers/models/m2m_100/modeling_m2m_100.py +14 -0
  428. transformers/models/mamba/modeling_mamba.py +16 -23
  429. transformers/models/mamba2/modeling_mamba2.py +24 -23
  430. transformers/models/marian/configuration_marian.py +1 -1
  431. transformers/models/marian/modeling_marian.py +8 -0
  432. transformers/models/markuplm/modeling_markuplm.py +9 -8
  433. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  434. transformers/models/mask2former/configuration_mask2former.py +3 -3
  435. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  436. transformers/models/mask2former/modeling_mask2former.py +11 -0
  437. transformers/models/maskformer/configuration_maskformer.py +3 -3
  438. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  439. transformers/models/maskformer/modeling_maskformer.py +11 -1
  440. transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
  441. transformers/models/mbart/configuration_mbart.py +1 -0
  442. transformers/models/mbart/modeling_mbart.py +14 -0
  443. transformers/models/mbart/tokenization_mbart.py +11 -52
  444. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  445. transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
  446. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  447. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  448. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  449. transformers/models/mimi/modeling_mimi.py +28 -5
  450. transformers/models/minimax/modeling_minimax.py +19 -6
  451. transformers/models/minimax/modular_minimax.py +12 -1
  452. transformers/models/ministral/modeling_ministral.py +5 -5
  453. transformers/models/ministral3/configuration_ministral3.py +1 -1
  454. transformers/models/ministral3/modeling_ministral3.py +5 -4
  455. transformers/models/mistral/modeling_mistral.py +5 -4
  456. transformers/models/mistral3/modeling_mistral3.py +10 -4
  457. transformers/models/mistral3/modular_mistral3.py +3 -1
  458. transformers/models/mixtral/modeling_mixtral.py +15 -7
  459. transformers/models/mixtral/modular_mixtral.py +6 -2
  460. transformers/models/mlcd/modeling_mlcd.py +6 -0
  461. transformers/models/mlcd/modular_mlcd.py +4 -0
  462. transformers/models/mllama/modeling_mllama.py +15 -4
  463. transformers/models/mluke/tokenization_mluke.py +6 -6
  464. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  465. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
  466. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  467. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  468. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  469. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  470. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  471. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  472. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  473. transformers/models/mobilevit/modeling_mobilevit.py +7 -0
  474. transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
  475. transformers/models/modernbert/modeling_modernbert.py +16 -2
  476. transformers/models/modernbert/modular_modernbert.py +14 -1
  477. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
  478. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
  479. transformers/models/moonshine/modeling_moonshine.py +5 -3
  480. transformers/models/moshi/modeling_moshi.py +26 -53
  481. transformers/models/mpnet/modeling_mpnet.py +7 -0
  482. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  483. transformers/models/mpt/modeling_mpt.py +2 -0
  484. transformers/models/mra/modeling_mra.py +10 -1
  485. transformers/models/mt5/configuration_mt5.py +2 -3
  486. transformers/models/mt5/modeling_mt5.py +7 -10
  487. transformers/models/musicgen/modeling_musicgen.py +7 -9
  488. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
  489. transformers/models/mvp/modeling_mvp.py +14 -0
  490. transformers/models/nanochat/modeling_nanochat.py +5 -5
  491. transformers/models/nemotron/modeling_nemotron.py +7 -5
  492. transformers/models/nllb/tokenization_nllb.py +8 -22
  493. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  494. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  495. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  496. transformers/models/nougat/tokenization_nougat.py +15 -68
  497. transformers/models/nystromformer/modeling_nystromformer.py +13 -0
  498. transformers/models/olmo/modeling_olmo.py +5 -5
  499. transformers/models/olmo/modular_olmo.py +2 -2
  500. transformers/models/olmo2/modeling_olmo2.py +5 -6
  501. transformers/models/olmo2/modular_olmo2.py +0 -1
  502. transformers/models/olmo3/modeling_olmo3.py +5 -5
  503. transformers/models/olmoe/modeling_olmoe.py +15 -7
  504. transformers/models/olmoe/modular_olmoe.py +4 -2
  505. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  506. transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
  507. transformers/models/oneformer/configuration_oneformer.py +3 -3
  508. transformers/models/oneformer/modeling_oneformer.py +11 -39
  509. transformers/models/openai/modeling_openai.py +15 -0
  510. transformers/models/openai/tokenization_openai.py +10 -46
  511. transformers/models/opt/modeling_opt.py +2 -0
  512. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  513. transformers/models/ovis2/modeling_ovis2.py +15 -3
  514. transformers/models/ovis2/modular_ovis2.py +8 -0
  515. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  516. transformers/models/owlv2/modeling_owlv2.py +11 -3
  517. transformers/models/owlv2/modular_owlv2.py +0 -2
  518. transformers/models/owlvit/modeling_owlvit.py +11 -3
  519. transformers/models/paddleocr_vl/__init__.py +32 -0
  520. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  521. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
  522. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  523. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
  524. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
  525. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  526. transformers/models/paligemma/modeling_paligemma.py +25 -17
  527. transformers/models/parakeet/configuration_parakeet.py +4 -6
  528. transformers/models/parakeet/modeling_parakeet.py +14 -6
  529. transformers/models/parakeet/modular_parakeet.py +7 -2
  530. transformers/models/parakeet/processing_parakeet.py +1 -0
  531. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  532. transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
  533. transformers/models/patchtst/modeling_patchtst.py +25 -6
  534. transformers/models/pe_audio/__init__.py +30 -0
  535. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  536. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  537. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  538. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  539. transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
  540. transformers/models/pe_audio_video/__init__.py +29 -0
  541. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  542. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  543. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  544. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  545. transformers/models/pe_video/__init__.py +30 -0
  546. transformers/models/pe_video/configuration_pe_video.py +211 -0
  547. transformers/models/pe_video/modeling_pe_video.py +636 -0
  548. transformers/models/pe_video/modular_pe_video.py +219 -0
  549. transformers/models/pe_video/processing_pe_video.py +10 -0
  550. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  551. transformers/models/pegasus/configuration_pegasus.py +1 -0
  552. transformers/models/pegasus/modeling_pegasus.py +8 -0
  553. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  554. transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
  555. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  556. transformers/models/perceiver/modeling_perceiver.py +13 -1
  557. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  558. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  559. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  560. transformers/models/persimmon/modeling_persimmon.py +3 -2
  561. transformers/models/phi/modeling_phi.py +5 -6
  562. transformers/models/phi/modular_phi.py +0 -1
  563. transformers/models/phi3/modeling_phi3.py +3 -2
  564. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
  565. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
  566. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  567. transformers/models/phimoe/modeling_phimoe.py +15 -7
  568. transformers/models/phimoe/modular_phimoe.py +3 -3
  569. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  570. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  571. transformers/models/pixio/__init__.py +30 -0
  572. transformers/models/pixio/configuration_pixio.py +151 -0
  573. transformers/models/pixio/modeling_pixio.py +507 -0
  574. transformers/models/pixio/modular_pixio.py +404 -0
  575. transformers/models/pixtral/modeling_pixtral.py +3 -2
  576. transformers/models/pixtral/processing_pixtral.py +3 -1
  577. transformers/models/plbart/configuration_plbart.py +1 -0
  578. transformers/models/plbart/modeling_plbart.py +13 -0
  579. transformers/models/plbart/modular_plbart.py +8 -0
  580. transformers/models/plbart/tokenization_plbart.py +0 -2
  581. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  582. transformers/models/poolformer/modeling_poolformer.py +13 -1
  583. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  584. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  585. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  586. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  587. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  588. transformers/models/prophetnet/modeling_prophetnet.py +5 -1
  589. transformers/models/pvt/modeling_pvt.py +2 -0
  590. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  591. transformers/models/qwen2/modeling_qwen2.py +5 -5
  592. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  593. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  594. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
  595. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
  596. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  597. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
  598. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
  599. transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
  600. transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
  601. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  602. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  603. transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
  604. transformers/models/qwen3/modeling_qwen3.py +5 -5
  605. transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
  606. transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
  607. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  608. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
  609. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
  610. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  611. transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
  612. transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
  613. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  614. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
  615. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
  616. transformers/models/rag/configuration_rag.py +0 -8
  617. transformers/models/rag/modeling_rag.py +8 -9
  618. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
  619. transformers/models/reformer/modeling_reformer.py +13 -1
  620. transformers/models/reformer/tokenization_reformer.py +11 -28
  621. transformers/models/regnet/modeling_regnet.py +10 -1
  622. transformers/models/rembert/modeling_rembert.py +13 -1
  623. transformers/models/rembert/tokenization_rembert.py +3 -10
  624. transformers/models/resnet/modeling_resnet.py +19 -5
  625. transformers/models/roberta/modeling_roberta.py +3 -0
  626. transformers/models/roberta/modular_roberta.py +3 -0
  627. transformers/models/roberta/tokenization_roberta.py +18 -27
  628. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  629. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  630. transformers/models/roformer/modeling_roformer.py +6 -0
  631. transformers/models/roformer/tokenization_roformer.py +77 -412
  632. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  633. transformers/models/rt_detr/modeling_rt_detr.py +6 -0
  634. transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
  635. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  636. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
  637. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  638. transformers/models/rwkv/modeling_rwkv.py +2 -1
  639. transformers/models/sam/configuration_sam.py +1 -0
  640. transformers/models/sam/image_processing_sam_fast.py +0 -1
  641. transformers/models/sam/modeling_sam.py +4 -1
  642. transformers/models/sam2/configuration_sam2.py +1 -1
  643. transformers/models/sam2/modeling_sam2.py +7 -3
  644. transformers/models/sam2/modular_sam2.py +7 -3
  645. transformers/models/sam2_video/modeling_sam2_video.py +52 -43
  646. transformers/models/sam2_video/modular_sam2_video.py +32 -18
  647. transformers/models/sam3/configuration_sam3.py +21 -1
  648. transformers/models/sam3/modeling_sam3.py +100 -80
  649. transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
  650. transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
  651. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  652. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
  653. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  654. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  655. transformers/models/sam3_video/modeling_sam3_video.py +4 -3
  656. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  657. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  658. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  659. transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
  660. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  661. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
  662. transformers/models/seed_oss/modeling_seed_oss.py +3 -3
  663. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  664. transformers/models/segformer/modeling_segformer.py +6 -3
  665. transformers/models/segformer/modular_segformer.py +0 -1
  666. transformers/models/seggpt/modeling_seggpt.py +2 -0
  667. transformers/models/sew/modeling_sew.py +3 -0
  668. transformers/models/sew/modular_sew.py +1 -0
  669. transformers/models/sew_d/modeling_sew_d.py +3 -0
  670. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  671. transformers/models/siglip/modeling_siglip.py +24 -2
  672. transformers/models/siglip2/modeling_siglip2.py +67 -41
  673. transformers/models/siglip2/modular_siglip2.py +4 -0
  674. transformers/models/smollm3/modeling_smollm3.py +5 -5
  675. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  676. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  677. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  678. transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
  679. transformers/models/speecht5/modeling_speecht5.py +41 -1
  680. transformers/models/splinter/modeling_splinter.py +12 -3
  681. transformers/models/splinter/tokenization_splinter.py +9 -28
  682. transformers/models/squeezebert/modeling_squeezebert.py +8 -0
  683. transformers/models/stablelm/modeling_stablelm.py +4 -2
  684. transformers/models/starcoder2/modeling_starcoder2.py +5 -4
  685. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  686. transformers/models/superglue/modeling_superglue.py +1 -0
  687. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  688. transformers/models/superpoint/modeling_superpoint.py +1 -0
  689. transformers/models/swiftformer/modeling_swiftformer.py +6 -0
  690. transformers/models/swin/modeling_swin.py +20 -12
  691. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  692. transformers/models/swin2sr/modeling_swin2sr.py +51 -33
  693. transformers/models/swinv2/modeling_swinv2.py +45 -33
  694. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  695. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  696. transformers/models/t5/configuration_t5.py +7 -1
  697. transformers/models/t5/modeling_t5.py +8 -7
  698. transformers/models/t5/tokenization_t5.py +4 -8
  699. transformers/models/t5gemma/modeling_t5gemma.py +6 -6
  700. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  701. transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
  702. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  703. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  704. transformers/models/table_transformer/modeling_table_transformer.py +5 -1
  705. transformers/models/tapas/modeling_tapas.py +3 -0
  706. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  707. transformers/models/textnet/modeling_textnet.py +11 -2
  708. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  709. transformers/models/timesfm/modeling_timesfm.py +14 -0
  710. transformers/models/timesfm/modular_timesfm.py +14 -0
  711. transformers/models/timesformer/modeling_timesformer.py +2 -0
  712. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  713. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  714. transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
  715. transformers/models/trocr/modeling_trocr.py +3 -2
  716. transformers/models/tvp/configuration_tvp.py +5 -1
  717. transformers/models/tvp/modeling_tvp.py +6 -4
  718. transformers/models/udop/configuration_udop.py +1 -0
  719. transformers/models/udop/modeling_udop.py +7 -7
  720. transformers/models/udop/tokenization_udop.py +5 -13
  721. transformers/models/umt5/configuration_umt5.py +2 -2
  722. transformers/models/umt5/modeling_umt5.py +7 -6
  723. transformers/models/unispeech/modeling_unispeech.py +4 -0
  724. transformers/models/unispeech/modular_unispeech.py +2 -0
  725. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  726. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  727. transformers/models/univnet/modeling_univnet.py +1 -0
  728. transformers/models/upernet/modeling_upernet.py +1 -0
  729. transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
  730. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  731. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  732. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  733. transformers/models/video_llava/modeling_video_llava.py +7 -3
  734. transformers/models/vilt/configuration_vilt.py +2 -2
  735. transformers/models/vilt/modeling_vilt.py +13 -0
  736. transformers/models/vipllava/modeling_vipllava.py +7 -3
  737. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  738. transformers/models/visual_bert/modeling_visual_bert.py +8 -0
  739. transformers/models/vitdet/modeling_vitdet.py +2 -0
  740. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  741. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  742. transformers/models/vitmatte/modeling_vitmatte.py +5 -0
  743. transformers/models/vitpose/configuration_vitpose.py +1 -1
  744. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  745. transformers/models/vits/modeling_vits.py +1 -0
  746. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  747. transformers/models/voxtral/modeling_voxtral.py +2 -2
  748. transformers/models/voxtral/modular_voxtral.py +2 -2
  749. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  750. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
  751. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
  752. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
  753. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  754. transformers/models/wavlm/modeling_wavlm.py +5 -0
  755. transformers/models/whisper/generation_whisper.py +1 -0
  756. transformers/models/whisper/modeling_whisper.py +11 -3
  757. transformers/models/whisper/tokenization_whisper.py +4 -15
  758. transformers/models/x_clip/modeling_x_clip.py +5 -0
  759. transformers/models/xcodec/modeling_xcodec.py +5 -0
  760. transformers/models/xglm/modeling_xglm.py +11 -0
  761. transformers/models/xglm/tokenization_xglm.py +4 -9
  762. transformers/models/xlm/modeling_xlm.py +18 -14
  763. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  764. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  765. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  766. transformers/models/xlnet/modeling_xlnet.py +3 -1
  767. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  768. transformers/models/xmod/modeling_xmod.py +3 -0
  769. transformers/models/yoso/modeling_yoso.py +10 -1
  770. transformers/models/zamba/modeling_zamba.py +4 -1
  771. transformers/models/zamba2/modeling_zamba2.py +7 -4
  772. transformers/models/zamba2/modular_zamba2.py +1 -1
  773. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  774. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  775. transformers/models/zoedepth/modeling_zoedepth.py +8 -0
  776. transformers/pipelines/__init__.py +11 -9
  777. transformers/pipelines/automatic_speech_recognition.py +20 -12
  778. transformers/pipelines/base.py +2 -10
  779. transformers/pipelines/document_question_answering.py +4 -2
  780. transformers/pipelines/question_answering.py +1 -1
  781. transformers/pipelines/text_generation.py +1 -1
  782. transformers/pipelines/text_to_audio.py +2 -2
  783. transformers/processing_utils.py +133 -50
  784. transformers/quantizers/auto.py +2 -4
  785. transformers/quantizers/base.py +44 -174
  786. transformers/quantizers/quantizer_aqlm.py +2 -23
  787. transformers/quantizers/quantizer_auto_round.py +2 -12
  788. transformers/quantizers/quantizer_awq.py +20 -89
  789. transformers/quantizers/quantizer_bitnet.py +4 -14
  790. transformers/quantizers/quantizer_bnb_4bit.py +18 -155
  791. transformers/quantizers/quantizer_bnb_8bit.py +24 -110
  792. transformers/quantizers/quantizer_compressed_tensors.py +2 -9
  793. transformers/quantizers/quantizer_eetq.py +16 -74
  794. transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
  795. transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
  796. transformers/quantizers/quantizer_fp_quant.py +52 -82
  797. transformers/quantizers/quantizer_gptq.py +8 -28
  798. transformers/quantizers/quantizer_higgs.py +42 -60
  799. transformers/quantizers/quantizer_hqq.py +144 -153
  800. transformers/quantizers/quantizer_mxfp4.py +14 -194
  801. transformers/quantizers/quantizer_quanto.py +35 -79
  802. transformers/quantizers/quantizer_quark.py +36 -17
  803. transformers/quantizers/quantizer_spqr.py +4 -12
  804. transformers/quantizers/quantizer_torchao.py +50 -325
  805. transformers/quantizers/quantizer_vptq.py +4 -27
  806. transformers/quantizers/quantizers_utils.py +20 -0
  807. transformers/testing_utils.py +324 -47
  808. transformers/tokenization_mistral_common.py +7 -2
  809. transformers/tokenization_utils_base.py +116 -224
  810. transformers/tokenization_utils_tokenizers.py +190 -106
  811. transformers/trainer.py +51 -32
  812. transformers/trainer_callback.py +8 -0
  813. transformers/trainer_jit_checkpoint.py +126 -0
  814. transformers/trainer_seq2seq.py +4 -0
  815. transformers/trainer_utils.py +1 -1
  816. transformers/training_args.py +74 -38
  817. transformers/utils/__init__.py +7 -4
  818. transformers/utils/attention_visualizer.py +4 -4
  819. transformers/utils/auto_docstring.py +35 -25
  820. transformers/utils/generic.py +47 -1
  821. transformers/utils/hub.py +5 -15
  822. transformers/utils/import_utils.py +112 -25
  823. transformers/utils/kernel_config.py +74 -19
  824. transformers/utils/loading_report.py +19 -10
  825. transformers/utils/quantization_config.py +78 -245
  826. transformers/video_processing_utils.py +17 -14
  827. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
  828. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
  829. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
  830. transformers/kernels/__init__.py +0 -0
  831. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  832. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  833. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  834. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
  835. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -54,6 +54,112 @@ from .configuration_xlm_roberta import XLMRobertaConfig
  logger = logging.get_logger(__name__)


+ class XLMRobertaEmbeddings(nn.Module):
+     """Construct the embeddings from word, position and token_type embeddings."""
+
+     def __init__(self, config):
+         super().__init__()
+         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+         self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+         self.dropout = nn.Dropout(config.hidden_dropout_prob)
+         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+         self.register_buffer(
+             "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+         )
+         self.register_buffer(
+             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+         )
+
+         self.padding_idx = config.pad_token_id
+         self.position_embeddings = nn.Embedding(
+             config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+         )
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         token_type_ids: Optional[torch.LongTensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         past_key_values_length: int = 0,
+     ) -> torch.Tensor:
+         if position_ids is None:
+             if input_ids is not None:
+                 # Create the position ids from the input token ids. Any padded tokens remain padded.
+                 position_ids = self.create_position_ids_from_input_ids(
+                     input_ids, self.padding_idx, past_key_values_length
+                 )
+             else:
+                 position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, self.padding_idx)
+
+         if input_ids is not None:
+             input_shape = input_ids.size()
+         else:
+             input_shape = inputs_embeds.size()[:-1]
+
+         batch_size, seq_length = input_shape
+
+         # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+         # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+         # issue #5664
+         if token_type_ids is None:
+             if hasattr(self, "token_type_ids"):
+                 # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0])
+                 buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1)
+                 buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids)
+                 token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
+             else:
+                 token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+         if inputs_embeds is None:
+             inputs_embeds = self.word_embeddings(input_ids)
+         token_type_embeddings = self.token_type_embeddings(token_type_ids)
+         embeddings = inputs_embeds + token_type_embeddings
+
+         position_embeddings = self.position_embeddings(position_ids)
+         embeddings = embeddings + position_embeddings
+
+         embeddings = self.LayerNorm(embeddings)
+         embeddings = self.dropout(embeddings)
+         return embeddings
+
+     @staticmethod
+     def create_position_ids_from_inputs_embeds(inputs_embeds, padding_idx):
+         """
+         We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+         Args:
+             inputs_embeds: torch.Tensor
+
+         Returns: torch.Tensor
+         """
+         input_shape = inputs_embeds.size()[:-1]
+         sequence_length = input_shape[1]
+
+         position_ids = torch.arange(
+             padding_idx + 1, sequence_length + padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+         )
+         return position_ids.unsqueeze(0).expand(input_shape)
+
+     @staticmethod
+     def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+         """
+         Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+         are ignored. This is modified from fairseq's `utils.make_positions`.
+
+         Args:
+             input_ids: torch.Tensor
+
+         Returns: torch.Tensor
+         """
+         # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+         mask = input_ids.ne(padding_idx).int()
+         incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+         return incremental_indices.long() + padding_idx
+
+
  def eager_attention_forward(
      module: nn.Module,
      query: torch.Tensor,
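
The two position-id helpers above are self-contained, so their behaviour is easy to check in isolation. A minimal worked example of `create_position_ids_from_input_ids`, assuming `padding_idx=1` (the id XLM-RoBERTa uses for `<pad>`):

import torch

# Toy batch with two trailing pad tokens (pad id = 1).
input_ids = torch.tensor([[0, 5, 6, 1, 1]])
padding_idx = 1

mask = input_ids.ne(padding_idx).int()                        # [[1, 1, 1, 0, 0]]
incremental = torch.cumsum(mask, dim=1).type_as(mask) * mask  # [[1, 2, 3, 0, 0]]
position_ids = incremental.long() + padding_idx               # [[2, 3, 4, 1, 1]]
# Real tokens get positions starting at padding_idx + 1; every pad token
# collapses back to padding_idx itself, matching the fairseq convention.
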
@@ -417,112 +523,9 @@ class XLMRobertaPreTrainedModel(PreTrainedModel):
          super()._init_weights(module)
          if isinstance(module, XLMRobertaLMHead):
              init.zeros_(module.bias)
-
-
- class XLMRobertaEmbeddings(nn.Module):
-     """Construct the embeddings from word, position and token_type embeddings."""
-
-     def __init__(self, config):
-         super().__init__()
-         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
-         self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
-
-         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-         self.dropout = nn.Dropout(config.hidden_dropout_prob)
-         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-         self.register_buffer(
-             "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
-         )
-         self.register_buffer(
-             "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
-         )
-
-         self.padding_idx = config.pad_token_id
-         self.position_embeddings = nn.Embedding(
-             config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
-         )
-
-     def forward(
-         self,
-         input_ids: Optional[torch.LongTensor] = None,
-         token_type_ids: Optional[torch.LongTensor] = None,
-         position_ids: Optional[torch.LongTensor] = None,
-         inputs_embeds: Optional[torch.FloatTensor] = None,
-         past_key_values_length: int = 0,
-     ) -> torch.Tensor:
-         if position_ids is None:
-             if input_ids is not None:
-                 # Create the position ids from the input token ids. Any padded tokens remain padded.
-                 position_ids = self.create_position_ids_from_input_ids(
-                     input_ids, self.padding_idx, past_key_values_length
-                 )
-             else:
-                 position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, self.padding_idx)
-
-         if input_ids is not None:
-             input_shape = input_ids.size()
-         else:
-             input_shape = inputs_embeds.size()[:-1]
-
-         batch_size, seq_length = input_shape
-
-         # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
-         # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
-         # issue #5664
-         if token_type_ids is None:
-             if hasattr(self, "token_type_ids"):
-                 # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0])
-                 buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1)
-                 buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids)
-                 token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
-             else:
-                 token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
-
-         if inputs_embeds is None:
-             inputs_embeds = self.word_embeddings(input_ids)
-         token_type_embeddings = self.token_type_embeddings(token_type_ids)
-         embeddings = inputs_embeds + token_type_embeddings
-
-         position_embeddings = self.position_embeddings(position_ids)
-         embeddings = embeddings + position_embeddings
-
-         embeddings = self.LayerNorm(embeddings)
-         embeddings = self.dropout(embeddings)
-         return embeddings
-
-     @staticmethod
-     def create_position_ids_from_inputs_embeds(inputs_embeds, padding_idx):
-         """
-         We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
-
-         Args:
-             inputs_embeds: torch.Tensor
-
-         Returns: torch.Tensor
-         """
-         input_shape = inputs_embeds.size()[:-1]
-         sequence_length = input_shape[1]
-
-         position_ids = torch.arange(
-             padding_idx + 1, sequence_length + padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
-         )
-         return position_ids.unsqueeze(0).expand(input_shape)
-
-     @staticmethod
-     def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
-         """
-         Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
-         are ignored. This is modified from fairseq's `utils.make_positions`.
-
-         Args:
-             input_ids: torch.Tensor
-
-         Returns: torch.Tensor
-         """
-         # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
-         mask = input_ids.ne(padding_idx).int()
-         incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
-         return incremental_indices.long() + padding_idx
+         elif isinstance(module, XLMRobertaEmbeddings):
+             init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+             init.zeros_(module.token_type_ids)


  class XLMRobertaEncoder(nn.Module):
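
The new `elif` branch exists because both buffers are registered with `persistent=False`: they are absent from checkpoints, so `_init_weights` must rebuild their contents when a model is materialized. A minimal sketch of the pattern in plain PyTorch (`ToyEmbeddings` and `init_buffers` are hypothetical stand-ins for `XLMRobertaEmbeddings` and the `initialization` helpers):

import torch
from torch import nn

class ToyEmbeddings(nn.Module):  # hypothetical stand-in for XLMRobertaEmbeddings
    def __init__(self, max_positions: int = 8):
        super().__init__()
        # Non-persistent buffers are skipped by state_dict(), hence by checkpoints.
        self.register_buffer("position_ids", torch.arange(max_positions).expand((1, -1)), persistent=False)
        self.register_buffer("token_type_ids", torch.zeros((1, max_positions), dtype=torch.long), persistent=False)

def init_buffers(module):
    # Mirrors the elif branch above, with plain copy_/zero_ in place of
    # the transformers `initialization` helpers.
    if isinstance(module, ToyEmbeddings):
        module.position_ids.copy_(torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        module.token_type_ids.zero_()
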
@@ -14,7 +14,7 @@
  # limitations under the License
  """Tokenization classes for XLM-RoBERTa model (Tokenizers backend)."""

- from typing import Optional
+ from typing import Optional, Union

  from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
  from tokenizers.models import Unigram
@@ -47,16 +47,17 @@ class XLMRobertaTokenizer(TokenizersBackend):
          pad_token (`str`, optional, defaults to `"<pad>"`): The padding token.
          mask_token (`str`, optional, defaults to `"<mask>"`): The mask token.
          add_prefix_space (`bool`, optional, defaults to `True`): Whether to add an initial space.
-         vocab (`dict`, optional): Custom vocabulary dictionary.
-         merges (`list`, optional): Custom merges list.
+         vocab (`str`, `dict` or `list`, optional): Custom vocabulary dictionary.
      """

      vocab_files_names = VOCAB_FILES_NAMES
      model_input_names = ["input_ids", "attention_mask"]
-     slow_tokenizer_class = None
+     model = Unigram

      def __init__(
          self,
+         vocab: Optional[Union[str, list[tuple[str, float]]]] = None,
+         add_prefix_space: bool = True,
          bos_token: str = "<s>",
          eos_token: str = "</s>",
          sep_token: str = "</s>",
@@ -64,9 +65,6 @@ class XLMRobertaTokenizer(TokenizersBackend):
          unk_token: str = "<unk>",
          pad_token: str = "<pad>",
          mask_token: str = "<mask>",
-         add_prefix_space: bool = True,
-         vocab: Optional[dict] = None,
-         vocab_file: Optional[str] = None,
          **kwargs,
      ):
          self.add_prefix_space = add_prefix_space
@@ -99,11 +97,7 @@ class XLMRobertaTokenizer(TokenizersBackend):
              ]
          )
          self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)
-
-         tokenizer_object = self._tokenizer
-
          super().__init__(
-             tokenizer_object=tokenizer_object,
              bos_token=bos_token,
              eos_token=eos_token,
              sep_token=sep_token,
@@ -116,14 +110,13 @@ class XLMRobertaTokenizer(TokenizersBackend):
          )

          self._tokenizer.post_processor = processors.TemplateProcessing(
-             single=["$A", "</s>"],
-             pair=["$A", "</s>", "$B", "</s>"],
+             single=[str(bos_token), "$A", str(eos_token)],
+             pair=[str(bos_token), "$A", str(eos_token), "$B", str(eos_token)],
              special_tokens=[
-                 ("</s>", self.eos_token_id),
+                 (str(bos_token), self.bos_token_id),
+                 (str(eos_token), self.eos_token_id),
              ],
          )

-         self.vocab_file = vocab_file
-

  __all__ = ["XLMRobertaTokenizer"]
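
With the new template, a single sequence is wrapped as `<s> ... </s>` instead of only being suffixed with `</s>`. The effect can be reproduced standalone with the `tokenizers` library (toy Unigram vocabulary with made-up scores, not the real SentencePiece model):

from tokenizers import Tokenizer, processors
from tokenizers.models import Unigram

# Toy vocab: (piece, log-probability) pairs; unk_id indexes "<unk>".
tok = Tokenizer(Unigram([("<unk>", 0.0), ("<s>", 0.0), ("</s>", 0.0), ("hello", -1.0)], unk_id=0))
tok.post_processor = processors.TemplateProcessing(
    single=["<s>", "$A", "</s>"],
    pair=["<s>", "$A", "</s>", "$B", "</s>"],
    special_tokens=[("<s>", 1), ("</s>", 2)],
)
print(tok.encode("hello").tokens)  # ['<s>', 'hello', '</s>']
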
@@ -542,6 +542,9 @@ class XLMRobertaXLPreTrainedModel(PreTrainedModel):
          super()._init_weights(module)
          if isinstance(module, XLMRobertaXLLMHead):
              init.zeros_(module.bias)
+         elif isinstance(module, XLMRobertaXLEmbeddings):
+             init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+             init.zeros_(module.token_type_ids)


  class XLMRobertaXLPooler(nn.Module):
@@ -1244,7 +1244,9 @@ class XLNetLMHeadModel(XLNetPreTrainedModel, GenerationMixin):
      def set_output_embeddings(self, new_embeddings):
          self.lm_loss = new_embeddings

-     def prepare_inputs_for_generation(self, input_ids, past_key_values=None, use_mems=None, **kwargs):
+     def prepare_inputs_for_generation(
+         self, input_ids, past_key_values=None, use_mems=None, is_first_iteration=False, **kwargs
+     ):
          # Overwritten -- this model has unique input preparation

          # Add dummy token at the end (no attention on this one)
@@ -14,7 +14,7 @@
  # limitations under the License.
  """Tokenization classes for XLNet model."""

- from typing import Optional
+ from typing import Optional, Union

  from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
  from tokenizers.models import Unigram
@@ -98,10 +98,11 @@ class XLNetTokenizer(TokenizersBackend):
      vocab_files_names = VOCAB_FILES_NAMES
      padding_side = "left"
+     model = Unigram

      def __init__(
          self,
-         vocab: Optional[list] = None,
+         vocab: Optional[Union[str, list[tuple[str, float]]]] = None,
          unk_id: int = 0,
          do_lower_case=False,
          remove_space=True,
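
The widened `vocab` annotation matches what a Unigram model consumes: either a string (presumably a path to serialized vocabulary data) or an in-memory list of `(piece, score)` pairs. A toy value of the list form (pieces and scores are made up):

# Unigram vocabulary as list[tuple[str, float]]: (piece, log-probability) pairs.
vocab: list[tuple[str, float]] = [
    ("<unk>", 0.0),
    ("▁the", -2.1),
    ("▁hello", -4.7),
    ("ing", -3.4),
]
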
@@ -159,13 +160,8 @@ class XLNetTokenizer(TokenizersBackend):
          self.do_lower_case = do_lower_case
          self.remove_space = remove_space
          self.keep_accents = keep_accents
-
          mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
-         tokenizer_object = self._tokenizer
-
          super().__init__(
-             tokenizer_object=tokenizer_object,
              unk_id=unk_id,
              do_lower_case=do_lower_case,
              remove_space=remove_space,
@@ -634,6 +634,9 @@ class XmodPreTrainedModel(PreTrainedModel):
          super()._init_weights(module)
          if isinstance(module, XmodLMHead):
              init.zeros_(module.bias)
+         elif isinstance(module, XmodEmbeddings):
+             init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+             init.zeros_(module.token_type_ids)

      def set_default_language(self, language: str):
          """
@@ -54,7 +54,7 @@ def load_cuda_kernels():
      global lsh_cumulation
      if not is_kernels_available():
          raise ImportError("kernels is not installed, please install it with `pip install kernels`")
-     from kernels import get_kernel
+     from ...integrations.hub_kernels import get_kernel

      yoso = get_kernel("kernels-community/yoso")
      lsh_cumulation = yoso.lsh_cumulation
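
Routing the import through `transformers.integrations.hub_kernels` rather than the raw `kernels` package keeps kernel resolution under the library's own wrapper (see `integrations/hub_kernels.py` in the file list above). The call pattern is unchanged from the caller's side; a sketch, assuming the `kernels` extra is installed:

from transformers.integrations.hub_kernels import get_kernel

yoso = get_kernel("kernels-community/yoso")  # fetches/caches the Hub-hosted kernel repo
lsh_cumulation = yoso.lsh_cumulation         # resolve the kernel function once, reuse afterwards
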
@@ -611,6 +611,9 @@ class YosoPreTrainedModel(PreTrainedModel):
          super()._init_weights(module)
          if isinstance(module, YosoLMPredictionHead):
              init.zeros_(module.bias)
+         elif isinstance(module, YosoEmbeddings):
+             init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)) + 2)
+             init.zeros_(module.token_type_ids)


  @auto_docstring
@@ -642,6 +645,7 @@ class YosoModel(YosoPreTrainedModel):
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
+         **kwargs,
      ) -> Union[tuple, BaseModelOutputWithCrossAttentions]:
          output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
          output_hidden_states = (
@@ -734,6 +738,7 @@ class YosoForMaskedLM(YosoPreTrainedModel):
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
+         **kwargs,
      ) -> Union[tuple, MaskedLMOutput]:
          r"""
          labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -823,6 +828,7 @@ class YosoForSequenceClassification(YosoPreTrainedModel):
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
+         **kwargs,
      ) -> Union[tuple, SequenceClassifierOutput]:
          r"""
          labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -904,6 +910,7 @@ class YosoForMultipleChoice(YosoPreTrainedModel):
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
+         **kwargs,
      ) -> Union[tuple, MultipleChoiceModelOutput]:
          r"""
          input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -1009,6 +1016,7 @@ class YosoForTokenClassification(YosoPreTrainedModel):
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
+         **kwargs,
      ) -> Union[tuple, TokenClassifierOutput]:
          r"""
          labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1085,6 +1093,7 @@ class YosoForQuestionAnswering(YosoPreTrainedModel):
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
+         **kwargs,
      ) -> Union[tuple, QuestionAnsweringModelOutput]:
          return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -870,6 +870,7 @@ class ZambaModel(ZambaPreTrainedModel):
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          cache_position: Optional[torch.LongTensor] = None,
+         **kwargs,
      ) -> Union[tuple, BaseModelOutputWithPast]:
          output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
          output_hidden_states = (
@@ -1098,6 +1099,7 @@ class ZambaForCausalLM(ZambaPreTrainedModel, GenerationMixin):
          cache_position=None,
          position_ids=None,
          use_cache=True,
+         is_first_iteration=False,
          **kwargs,
      ):
          # Overwritten -- has a unique cache type, `ZambaHybridDynamicCache`
@@ -1131,7 +1133,7 @@ class ZambaForCausalLM(ZambaPreTrainedModel, GenerationMixin):
              position_ids = position_ids[:, -input_ids.shape[1] :]

          # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-         if inputs_embeds is not None and empty_past_kv:
+         if inputs_embeds is not None and is_first_iteration:
              model_inputs = {"inputs_embeds": inputs_embeds}
          else:
              model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
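
The recurring `empty_past_kv` → `is_first_iteration` switch matters for models with pre-allocated hybrid caches (Zamba here, Zamba2 below; XLNet's `prepare_inputs_for_generation` above gained the same flag): their cache is never literally empty, so the prefill step is now signalled explicitly by generation. In outline (a condensed sketch, not the full method):

def select_model_inputs(input_ids, inputs_embeds, is_first_iteration: bool):
    # inputs_embeds are honoured only on the prefill step; afterwards
    # generation feeds the newly sampled token ids.
    if inputs_embeds is not None and is_first_iteration:
        return {"inputs_embeds": inputs_embeds}
    return {"input_ids": input_ids.contiguous()}  # contiguous() for compile paths
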
@@ -1192,6 +1194,7 @@ class ZambaForSequenceClassification(ZambaPreTrainedModel):
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
+         **kwargs,
      ) -> Union[tuple, SequenceClassifierOutputWithPast]:
          r"""
          labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -41,6 +41,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
  from ...processing_utils import Unpack
  from ...utils import auto_docstring, logging
+ from ...utils.generic import maybe_autocast
  from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
  from .configuration_zamba2 import Zamba2Config

@@ -224,7 +225,7 @@ class Zamba2RotaryEmbedding(nn.Module):
          inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

          self.register_buffer("inv_freq", inv_freq, persistent=False)
-         self.original_inv_freq = inv_freq
+         self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

      @staticmethod
      def compute_default_rope_parameters(
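
Registering `original_inv_freq` as a buffer means it now follows `module.to(device)` and dtype casts along with the rest of the module, while `persistent=False` keeps it out of `state_dict()`; the `.clone()` prevents later in-place updates of `inv_freq` from aliasing it. A minimal demonstration of both properties:

import torch
from torch import nn

class Demo(nn.Module):
    def __init__(self):
        super().__init__()
        inv_freq = torch.ones(4)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.plain = inv_freq  # plain attribute: aliases inv_freq, ignored by .to()
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

m = Demo()
m.inv_freq.mul_(2.0)
print(m.plain[0].item(), m.original_inv_freq[0].item())  # 2.0 1.0 -- the clone is unaffected
print("original_inv_freq" in m.state_dict())             # False: non-persistent buffers are skipped
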
@@ -263,7 +264,7 @@ class Zamba2RotaryEmbedding(nn.Module):
          position_ids_expanded = position_ids[:, None, :].float()

          device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-         with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+         with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
              freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
              emb = torch.cat((freqs, freqs), dim=-1)
              cos = emb.cos() * self.attention_scaling
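
`maybe_autocast` comes from `transformers.utils.generic`; the name suggests a guard that degrades to a no-op where `torch.autocast` is unavailable for the device. A rough sketch of what such a wrapper could look like (an assumption for illustration, not the actual implementation):

import contextlib
import torch

def maybe_autocast_sketch(device_type: str, enabled: bool = True, **kwargs):
    # Hypothetical fallback: use torch.autocast when the device type supports
    # it, otherwise hand back a null context so the `with` block still runs.
    try:
        return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
    except RuntimeError:
        return contextlib.nullcontext()
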
@@ -424,7 +425,6 @@ class Zamba2Attention(nn.Module):
          attention_mask: Optional[torch.Tensor] = None,
          past_key_values: Optional[Zamba2HybridDynamicCache] = None,
          position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
-         position_ids: Optional[torch.Tensor] = None,
          **kwargs: Unpack[FlashAttentionKwargs],
      ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
          input_shape = hidden_states.shape[:-1]
@@ -1294,6 +1294,7 @@ class Zamba2Model(Zamba2PreTrainedModel):
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          cache_position: Optional[torch.LongTensor] = None,
+         **kwargs,
      ) -> Union[tuple, BaseModelOutputWithPast]:
          output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
          output_hidden_states = (
@@ -1544,6 +1545,7 @@ class Zamba2ForCausalLM(Zamba2PreTrainedModel, GenerationMixin):
          cache_position=None,
          position_ids=None,
          use_cache=True,
+         is_first_iteration=False,
          **kwargs,
      ):
          # Overwritten -- has a unique cache type, `Zamba2HybridDynamicCache`
@@ -1577,7 +1579,7 @@ class Zamba2ForCausalLM(Zamba2PreTrainedModel, GenerationMixin):
              position_ids = position_ids[:, -input_ids.shape[1] :]

          # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-         if inputs_embeds is not None and empty_past_kv:
+         if inputs_embeds is not None and is_first_iteration:
              model_inputs = {"inputs_embeds": inputs_embeds}
          else:
              model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
@@ -1638,6 +1640,7 @@ class Zamba2ForSequenceClassification(Zamba2PreTrainedModel):
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
+         **kwargs,
      ) -> Union[tuple, SequenceClassifierOutputWithPast]:
          r"""
          labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -232,7 +232,6 @@ class Zamba2Attention(ZambaAttention):
          attention_mask: Optional[torch.Tensor] = None,
          past_key_values: Optional[Zamba2HybridDynamicCache] = None,
          position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
-         position_ids: Optional[torch.Tensor] = None,
          **kwargs: Unpack[FlashAttentionKwargs],
      ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
          input_shape = hidden_states.shape[:-1]
@@ -993,6 +992,7 @@ class Zamba2Model(ZambaModel, Zamba2PreTrainedModel):
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          cache_position: Optional[torch.LongTensor] = None,
+         **kwargs,
      ) -> Union[tuple, BaseModelOutputWithPast]:
          output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
          output_hidden_states = (
@@ -37,7 +37,7 @@ class ZoeDepthConfig(PreTrainedConfig):
      documentation from [`PreTrainedConfig`] for more information.

      Args:
-         backbone_config (`Union[dict[str, Any], PreTrainedConfig]`, *optional*, defaults to `BeitConfig()`):
+         backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `BeitConfig()`):
              The configuration of the backbone model.
          backbone (`str`, *optional*):
              Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
@@ -171,9 +171,7 @@ class ZoeDepthImageProcessorFast(BaseImageProcessorFast):
              if do_normalize:
                  stacked_images = self.normalize(stacked_images, image_mean, image_std)
              resized_images_grouped[shape] = stacked_images
-         resized_images = reorder_images(resized_images_grouped, grouped_images_index)
-
-         processed_images = torch.stack(resized_images, dim=0) if return_tensors else resized_images
+         processed_images = reorder_images(resized_images_grouped, grouped_images_index)

          return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

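Dropping the manual `torch.stack` relies on `BatchFeature` converting its payload when given `tensor_type` — assuming its torch conversion path stacks a list of same-shaped tensors, which is what the simplified return above depends on:

import torch
from transformers.feature_extraction_utils import BatchFeature

# A list of same-shaped image tensors handed straight to BatchFeature;
# tensor_type="pt" triggers the conversion the processor used to do by hand.
images = [torch.rand(3, 32, 32) for _ in range(2)]
batch = BatchFeature(data={"pixel_values": images}, tensor_type="pt")
print(batch["pixel_values"].shape)  # expected: torch.Size([2, 3, 32, 32])
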
@@ -21,6 +21,7 @@ from typing import Optional, Union
  import torch
  from torch import nn

+ from ... import initialization as init
  from ...activations import ACT2FN
  from ...modeling_outputs import DepthEstimatorOutput
  from ...modeling_utils import PreTrainedModel
@@ -1211,6 +1212,12 @@ class ZoeDepthPreTrainedModel(PreTrainedModel):
      input_modalities = ("image",)
      supports_gradient_checkpointing = True

+     def _init_weights(self, module):
+         super()._init_weights(module)
+         if isinstance(module, LogBinomialSoftmax):
+             init.copy_(module.k_idx, torch.arange(0, module.k).view(1, -1, 1, 1))
+             init.copy_(module.k_minus_1, torch.tensor([module.k - 1]).view(1, -1, 1, 1))
+

  @auto_docstring(
      custom_intro="""
@@ -1251,6 +1258,7 @@ class ZoeDepthForDepthEstimation(ZoeDepthPreTrainedModel):
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
+         **kwargs,
      ) -> Union[tuple[torch.Tensor], DepthEstimatorOutput]:
          r"""
          labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
@@ -18,11 +18,11 @@ import warnings
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, Optional, Union

- from huggingface_hub import model_info
+ from huggingface_hub import is_offline_mode, model_info

  from ..configuration_utils import PreTrainedConfig
  from ..dynamic_module_utils import get_class_from_dynamic_module
- from ..feature_extraction_utils import PreTrainedFeatureExtractor
+ from ..feature_extraction_utils import FeatureExtractionMixin, PreTrainedFeatureExtractor
  from ..image_processing_utils import BaseImageProcessor
  from ..models.auto.configuration_auto import AutoConfig
  from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor
@@ -38,7 +38,6 @@ from ..utils import (
      extract_commit_hash,
      find_adapter_config_file,
      is_kenlm_available,
-     is_offline_mode,
      is_peft_available,
      is_pyctcdecode_available,
      is_torch_available,
@@ -278,7 +277,7 @@ SUPPORTED_TASKS = {
      "image-to-text": {
          "impl": ImageToTextPipeline,
          "pt": (AutoModelForImageTextToText,) if is_torch_available() else (),
-         "default": {"model": ("ydshieh/vit-gpt2-coco-en", "5bebf1e")},
+         "default": {"model": ("ydshieh/vit-gpt2-coco-en", "e460201")},
          "type": "multimodal",
      },
      "image-text-to-text": {
@@ -701,12 +700,14 @@ def pipeline(
      code_revision = kwargs.pop("code_revision", None)
      commit_hash = kwargs.pop("_commit_hash", None)
+     local_files_only = kwargs.get("local_files_only", False)

      hub_kwargs = {
          "revision": revision,
          "token": token,
          "trust_remote_code": trust_remote_code,
          "_commit_hash": commit_hash,
+         "local_files_only": local_files_only,
      }

      if task is None and model is None:
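
Previously `local_files_only` reached only the model-loading call; adding it to `hub_kwargs` makes config, tokenizer, feature-extractor, and processor resolution honour it as well. With a warm cache, a fully offline construction looks like:

from transformers import pipeline

# With all files already cached, no network access should be attempted
# anywhere in pipeline construction once the flag propagates via hub_kwargs.
classifier = pipeline(
    "text-classification",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    local_files_only=True,
)
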
@@ -987,12 +988,13 @@ def pipeline(
          feature_extractor = AutoFeatureExtractor.from_pretrained(
              feature_extractor, _from_pipeline=task, **hub_kwargs, **model_kwargs
          )
+         config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(
+             pretrained_model_name_or_path or model_name,
+             **hub_kwargs,
+         )
+         processor_class = config_dict.get("processor_class", None)

-         if (
-             feature_extractor._processor_class
-             and feature_extractor._processor_class.endswith("WithLM")
-             and isinstance(model_name, str)
-         ):
+         if processor_class is not None and processor_class.endswith("WithLM") and isinstance(model_name, str):
              try:
                  import kenlm  # to trigger `ImportError` if not installed
                  from pyctcdecode import BeamSearchDecoderCTC
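
The rewritten check reads `processor_class` from the raw preprocessor config on the Hub instead of the private `feature_extractor._processor_class` attribute, so `...WithLM` checkpoints (e.g. `Wav2Vec2ProcessorWithLM`) are still detected even if the instantiated feature extractor no longer carries that field. The detection logic, in outline:

# Outline of the check above: the checkpoint's preprocessor config stores
# e.g. "processor_class": "Wav2Vec2ProcessorWithLM" for CTC + n-gram models.
def needs_decoder_with_lm(config_dict: dict, model_name) -> bool:
    processor_class = config_dict.get("processor_class", None)
    return processor_class is not None and processor_class.endswith("WithLM") and isinstance(model_name, str)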