transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (835)
  1. transformers/__init__.py +49 -3
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/cli/serve.py +47 -17
  6. transformers/configuration_utils.py +114 -70
  7. transformers/conversion_mapping.py +83 -7
  8. transformers/convert_slow_tokenizer.py +225 -10
  9. transformers/core_model_loading.py +374 -147
  10. transformers/data/data_collator.py +12 -4
  11. transformers/dependency_versions_table.py +2 -3
  12. transformers/dynamic_module_utils.py +1 -2
  13. transformers/feature_extraction_utils.py +55 -24
  14. transformers/file_utils.py +0 -1
  15. transformers/generation/__init__.py +11 -1
  16. transformers/generation/candidate_generator.py +79 -31
  17. transformers/generation/configuration_utils.py +165 -124
  18. transformers/generation/continuous_batching/__init__.py +4 -0
  19. transformers/generation/continuous_batching/cache.py +47 -18
  20. transformers/generation/continuous_batching/cache_manager.py +131 -34
  21. transformers/generation/continuous_batching/continuous_api.py +228 -136
  22. transformers/generation/continuous_batching/requests.py +28 -1
  23. transformers/generation/continuous_batching/scheduler.py +11 -4
  24. transformers/generation/stopping_criteria.py +1 -1
  25. transformers/generation/utils.py +108 -110
  26. transformers/generation/watermarking.py +8 -5
  27. transformers/image_processing_base.py +3 -14
  28. transformers/image_processing_utils_fast.py +15 -4
  29. transformers/initialization.py +37 -0
  30. transformers/integrations/__init__.py +16 -2
  31. transformers/integrations/accelerate.py +58 -113
  32. transformers/integrations/aqlm.py +36 -66
  33. transformers/integrations/awq.py +46 -515
  34. transformers/integrations/bitnet.py +47 -105
  35. transformers/integrations/bitsandbytes.py +91 -202
  36. transformers/integrations/deepspeed.py +18 -2
  37. transformers/integrations/eetq.py +84 -81
  38. transformers/integrations/fbgemm_fp8.py +191 -145
  39. transformers/integrations/finegrained_fp8.py +241 -208
  40. transformers/integrations/flash_attention.py +2 -2
  41. transformers/integrations/fp_quant.py +92 -0
  42. transformers/integrations/ggml.py +11 -1
  43. transformers/integrations/higgs.py +37 -62
  44. transformers/integrations/hub_kernels.py +65 -8
  45. transformers/integrations/integration_utils.py +45 -0
  46. transformers/integrations/mistral.py +12 -0
  47. transformers/integrations/moe.py +240 -0
  48. transformers/integrations/mxfp4.py +28 -74
  49. transformers/integrations/peft.py +12 -29
  50. transformers/integrations/quanto.py +77 -56
  51. transformers/integrations/quark.py +55 -0
  52. transformers/integrations/spqr.py +42 -90
  53. transformers/integrations/tensor_parallel.py +167 -221
  54. transformers/integrations/torchao.py +32 -38
  55. transformers/integrations/vptq.py +40 -59
  56. transformers/modelcard.py +1 -2
  57. transformers/modeling_gguf_pytorch_utils.py +74 -19
  58. transformers/modeling_rope_utils.py +107 -86
  59. transformers/modeling_utils.py +611 -527
  60. transformers/models/__init__.py +22 -0
  61. transformers/models/afmoe/modeling_afmoe.py +10 -19
  62. transformers/models/afmoe/modular_afmoe.py +5 -13
  63. transformers/models/aimv2/modeling_aimv2.py +4 -0
  64. transformers/models/aimv2/modular_aimv2.py +4 -0
  65. transformers/models/albert/modeling_albert.py +3 -0
  66. transformers/models/albert/tokenization_albert.py +6 -12
  67. transformers/models/align/modeling_align.py +14 -6
  68. transformers/models/altclip/modeling_altclip.py +11 -3
  69. transformers/models/apertus/modeling_apertus.py +8 -6
  70. transformers/models/apertus/modular_apertus.py +4 -1
  71. transformers/models/arcee/modeling_arcee.py +5 -5
  72. transformers/models/aria/modeling_aria.py +12 -8
  73. transformers/models/aria/modular_aria.py +7 -3
  74. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  75. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  76. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  77. transformers/models/auto/auto_factory.py +1 -1
  78. transformers/models/auto/configuration_auto.py +38 -0
  79. transformers/models/auto/feature_extraction_auto.py +9 -3
  80. transformers/models/auto/image_processing_auto.py +5 -2
  81. transformers/models/auto/modeling_auto.py +37 -0
  82. transformers/models/auto/processing_auto.py +22 -10
  83. transformers/models/auto/tokenization_auto.py +147 -566
  84. transformers/models/auto/video_processing_auto.py +5 -2
  85. transformers/models/autoformer/modeling_autoformer.py +4 -0
  86. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  87. transformers/models/bamba/modeling_bamba.py +21 -21
  88. transformers/models/bamba/modular_bamba.py +17 -16
  89. transformers/models/bark/modeling_bark.py +11 -0
  90. transformers/models/bart/configuration_bart.py +0 -1
  91. transformers/models/bart/modeling_bart.py +14 -0
  92. transformers/models/barthez/tokenization_barthez.py +5 -10
  93. transformers/models/beit/image_processing_beit_fast.py +0 -1
  94. transformers/models/beit/modeling_beit.py +6 -1
  95. transformers/models/bert/modeling_bert.py +3 -0
  96. transformers/models/bert/tokenization_bert.py +8 -21
  97. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  98. transformers/models/big_bird/modeling_big_bird.py +9 -0
  99. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  100. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
  101. transformers/models/biogpt/modeling_biogpt.py +2 -0
  102. transformers/models/biogpt/modular_biogpt.py +2 -0
  103. transformers/models/bit/modeling_bit.py +16 -3
  104. transformers/models/bitnet/modeling_bitnet.py +5 -5
  105. transformers/models/blenderbot/modeling_blenderbot.py +12 -0
  106. transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
  107. transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
  108. transformers/models/blip/modeling_blip.py +2 -0
  109. transformers/models/blip/modeling_blip_text.py +10 -0
  110. transformers/models/blip_2/modeling_blip_2.py +4 -1
  111. transformers/models/bloom/modeling_bloom.py +17 -44
  112. transformers/models/blt/modeling_blt.py +164 -4
  113. transformers/models/blt/modular_blt.py +170 -5
  114. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  115. transformers/models/bridgetower/modeling_bridgetower.py +11 -1
  116. transformers/models/bros/modeling_bros.py +12 -0
  117. transformers/models/camembert/modeling_camembert.py +109 -106
  118. transformers/models/camembert/tokenization_camembert.py +8 -12
  119. transformers/models/canine/modeling_canine.py +11 -0
  120. transformers/models/canine/tokenization_canine.py +2 -0
  121. transformers/models/chameleon/modeling_chameleon.py +11 -5
  122. transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
  123. transformers/models/clap/feature_extraction_clap.py +2 -2
  124. transformers/models/clap/modeling_clap.py +30 -15
  125. transformers/models/clip/modeling_clip.py +2 -0
  126. transformers/models/clip/tokenization_clip.py +22 -44
  127. transformers/models/clipseg/modeling_clipseg.py +9 -0
  128. transformers/models/clvp/modeling_clvp.py +19 -3
  129. transformers/models/clvp/tokenization_clvp.py +1 -63
  130. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  131. transformers/models/codegen/modeling_codegen.py +13 -4
  132. transformers/models/codegen/tokenization_codegen.py +14 -43
  133. transformers/models/cohere/modeling_cohere.py +5 -4
  134. transformers/models/cohere/modular_cohere.py +2 -1
  135. transformers/models/cohere/tokenization_cohere.py +12 -42
  136. transformers/models/cohere2/modeling_cohere2.py +8 -7
  137. transformers/models/cohere2/modular_cohere2.py +5 -5
  138. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
  139. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  140. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  141. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  142. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  143. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  144. transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
  145. transformers/models/convbert/modeling_convbert.py +9 -0
  146. transformers/models/convnext/image_processing_convnext.py +2 -2
  147. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  148. transformers/models/convnext/modeling_convnext.py +2 -4
  149. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  150. transformers/models/csm/generation_csm.py +19 -22
  151. transformers/models/csm/modeling_csm.py +7 -4
  152. transformers/models/csm/modular_csm.py +2 -0
  153. transformers/models/ctrl/modeling_ctrl.py +15 -2
  154. transformers/models/cvt/modeling_cvt.py +7 -1
  155. transformers/models/cwm/modeling_cwm.py +5 -5
  156. transformers/models/d_fine/configuration_d_fine.py +3 -4
  157. transformers/models/d_fine/modeling_d_fine.py +48 -39
  158. transformers/models/d_fine/modular_d_fine.py +16 -4
  159. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  160. transformers/models/dab_detr/modeling_dab_detr.py +5 -1
  161. transformers/models/dac/modeling_dac.py +6 -6
  162. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  163. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  164. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  165. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  166. transformers/models/dbrx/configuration_dbrx.py +9 -1
  167. transformers/models/dbrx/modeling_dbrx.py +3 -3
  168. transformers/models/deberta/modeling_deberta.py +7 -0
  169. transformers/models/deberta/tokenization_deberta.py +11 -20
  170. transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
  171. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  172. transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
  173. transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
  174. transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
  175. transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
  176. transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
  177. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  178. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  179. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  180. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  181. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  182. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  183. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  184. transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
  185. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  186. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  187. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  188. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  189. transformers/models/detr/configuration_detr.py +1 -1
  190. transformers/models/detr/modeling_detr.py +13 -1
  191. transformers/models/dia/generation_dia.py +3 -10
  192. transformers/models/dia/modeling_dia.py +16 -4
  193. transformers/models/dia/modular_dia.py +11 -1
  194. transformers/models/dia/processing_dia.py +1 -1
  195. transformers/models/diffllama/modeling_diffllama.py +5 -5
  196. transformers/models/diffllama/modular_diffllama.py +2 -2
  197. transformers/models/dinat/modeling_dinat.py +3 -0
  198. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  199. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  200. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
  201. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
  202. transformers/models/distilbert/modeling_distilbert.py +11 -9
  203. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  204. transformers/models/doge/modeling_doge.py +3 -4
  205. transformers/models/doge/modular_doge.py +0 -1
  206. transformers/models/donut/image_processing_donut_fast.py +0 -1
  207. transformers/models/donut/modeling_donut_swin.py +18 -12
  208. transformers/models/dots1/modeling_dots1.py +23 -11
  209. transformers/models/dots1/modular_dots1.py +5 -3
  210. transformers/models/dpr/modeling_dpr.py +5 -0
  211. transformers/models/dpr/tokenization_dpr.py +12 -0
  212. transformers/models/dpt/configuration_dpt.py +1 -1
  213. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  214. transformers/models/dpt/modular_dpt.py +1 -2
  215. transformers/models/edgetam/configuration_edgetam.py +1 -1
  216. transformers/models/edgetam/modeling_edgetam.py +6 -3
  217. transformers/models/edgetam/modular_edgetam.py +15 -14
  218. transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
  219. transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
  220. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  221. transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
  222. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  223. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  224. transformers/models/efficientnet/modeling_efficientnet.py +7 -1
  225. transformers/models/electra/modeling_electra.py +7 -0
  226. transformers/models/emu3/modeling_emu3.py +12 -6
  227. transformers/models/emu3/modular_emu3.py +7 -1
  228. transformers/models/encodec/modeling_encodec.py +14 -0
  229. transformers/models/eomt/image_processing_eomt.py +13 -1
  230. transformers/models/eomt/image_processing_eomt_fast.py +60 -16
  231. transformers/models/eomt/modeling_eomt.py +7 -0
  232. transformers/models/eomt/modular_eomt.py +7 -0
  233. transformers/models/ernie/modeling_ernie.py +6 -0
  234. transformers/models/ernie/modular_ernie.py +6 -0
  235. transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
  236. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  237. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
  238. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
  239. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  240. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  241. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  242. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  243. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  244. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  245. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  246. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  247. transformers/models/esm/modeling_esm.py +6 -0
  248. transformers/models/esm/modeling_esmfold.py +11 -5
  249. transformers/models/evolla/modeling_evolla.py +13 -5
  250. transformers/models/evolla/modular_evolla.py +8 -0
  251. transformers/models/exaone4/modeling_exaone4.py +3 -3
  252. transformers/models/exaone4/modular_exaone4.py +0 -1
  253. transformers/models/falcon/modeling_falcon.py +9 -4
  254. transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
  255. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  256. transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
  257. transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
  258. transformers/models/fast_vlm/__init__.py +27 -0
  259. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  260. transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
  261. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  262. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
  263. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  264. transformers/models/flaubert/modeling_flaubert.py +21 -15
  265. transformers/models/flava/image_processing_flava_fast.py +0 -2
  266. transformers/models/flava/modeling_flava.py +10 -2
  267. transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
  268. transformers/models/florence2/modeling_florence2.py +22 -4
  269. transformers/models/florence2/modular_florence2.py +15 -1
  270. transformers/models/fnet/modeling_fnet.py +14 -0
  271. transformers/models/focalnet/modeling_focalnet.py +4 -0
  272. transformers/models/fsmt/modeling_fsmt.py +2 -0
  273. transformers/models/funnel/modeling_funnel.py +8 -0
  274. transformers/models/funnel/tokenization_funnel.py +17 -24
  275. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  276. transformers/models/fuyu/modeling_fuyu.py +3 -1
  277. transformers/models/fuyu/processing_fuyu.py +19 -3
  278. transformers/models/gemma/modeling_gemma.py +14 -16
  279. transformers/models/gemma/modular_gemma.py +9 -11
  280. transformers/models/gemma/tokenization_gemma.py +10 -27
  281. transformers/models/gemma2/modeling_gemma2.py +5 -5
  282. transformers/models/gemma2/modular_gemma2.py +3 -2
  283. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  284. transformers/models/gemma3/modeling_gemma3.py +42 -91
  285. transformers/models/gemma3/modular_gemma3.py +38 -87
  286. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  287. transformers/models/gemma3n/modeling_gemma3n.py +65 -218
  288. transformers/models/gemma3n/modular_gemma3n.py +68 -68
  289. transformers/models/git/modeling_git.py +183 -126
  290. transformers/models/glm/modeling_glm.py +5 -5
  291. transformers/models/glm4/modeling_glm4.py +5 -5
  292. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  293. transformers/models/glm46v/modeling_glm46v.py +3 -1
  294. transformers/models/glm46v/modular_glm46v.py +3 -0
  295. transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
  296. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  297. transformers/models/glm4v/configuration_glm4v.py +3 -1
  298. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  299. transformers/models/glm4v/modeling_glm4v.py +18 -8
  300. transformers/models/glm4v/modular_glm4v.py +17 -7
  301. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  302. transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
  303. transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
  304. transformers/models/glmasr/__init__.py +30 -0
  305. transformers/models/glmasr/configuration_glmasr.py +197 -0
  306. transformers/models/glmasr/modeling_glmasr.py +512 -0
  307. transformers/models/glmasr/modular_glmasr.py +433 -0
  308. transformers/models/glmasr/processing_glmasr.py +332 -0
  309. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  310. transformers/models/glpn/modeling_glpn.py +2 -0
  311. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  312. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  313. transformers/models/gpt2/modeling_gpt2.py +13 -6
  314. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  315. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
  316. transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
  317. transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
  318. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  319. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  320. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
  321. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  322. transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
  323. transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
  324. transformers/models/gptj/modeling_gptj.py +18 -6
  325. transformers/models/granite/modeling_granite.py +5 -5
  326. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  327. transformers/models/granitemoe/modeling_granitemoe.py +6 -9
  328. transformers/models/granitemoe/modular_granitemoe.py +1 -4
  329. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  330. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
  331. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  332. transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
  333. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  334. transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
  335. transformers/models/groupvit/modeling_groupvit.py +9 -1
  336. transformers/models/helium/modeling_helium.py +5 -4
  337. transformers/models/herbert/tokenization_herbert.py +9 -25
  338. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
  339. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
  340. transformers/models/hiera/modeling_hiera.py +4 -0
  341. transformers/models/hubert/modeling_hubert.py +7 -0
  342. transformers/models/hubert/modular_hubert.py +5 -0
  343. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
  344. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  345. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  346. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
  347. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  348. transformers/models/ibert/modeling_ibert.py +22 -0
  349. transformers/models/idefics/modeling_idefics.py +15 -21
  350. transformers/models/idefics2/modeling_idefics2.py +7 -1
  351. transformers/models/idefics3/modeling_idefics3.py +5 -1
  352. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  353. transformers/models/imagegpt/modeling_imagegpt.py +11 -3
  354. transformers/models/informer/modeling_informer.py +4 -0
  355. transformers/models/informer/modular_informer.py +1 -0
  356. transformers/models/instructblip/modeling_instructblip.py +2 -0
  357. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  358. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  359. transformers/models/internvl/modeling_internvl.py +13 -12
  360. transformers/models/internvl/modular_internvl.py +7 -13
  361. transformers/models/internvl/video_processing_internvl.py +0 -1
  362. transformers/models/jais2/__init__.py +27 -0
  363. transformers/models/jais2/configuration_jais2.py +152 -0
  364. transformers/models/jais2/modeling_jais2.py +486 -0
  365. transformers/models/jais2/modular_jais2.py +196 -0
  366. transformers/models/jamba/modeling_jamba.py +25 -20
  367. transformers/models/jamba/modular_jamba.py +17 -17
  368. transformers/models/janus/image_processing_janus_fast.py +0 -1
  369. transformers/models/janus/modeling_janus.py +16 -7
  370. transformers/models/janus/modular_janus.py +17 -7
  371. transformers/models/jetmoe/modeling_jetmoe.py +4 -4
  372. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  373. transformers/models/kosmos2/modeling_kosmos2.py +15 -2
  374. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  375. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  376. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
  377. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  378. transformers/models/lasr/__init__.py +29 -0
  379. transformers/models/lasr/configuration_lasr.py +248 -0
  380. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  381. transformers/models/lasr/modeling_lasr.py +730 -0
  382. transformers/models/lasr/modular_lasr.py +576 -0
  383. transformers/models/lasr/processing_lasr.py +94 -0
  384. transformers/models/lasr/tokenization_lasr.py +186 -0
  385. transformers/models/layoutlm/modeling_layoutlm.py +10 -3
  386. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  387. transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
  388. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
  389. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  390. transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
  391. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  392. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  393. transformers/models/led/modeling_led.py +12 -0
  394. transformers/models/levit/modeling_levit.py +21 -0
  395. transformers/models/lfm2/modeling_lfm2.py +5 -6
  396. transformers/models/lfm2/modular_lfm2.py +0 -1
  397. transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
  398. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  399. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  400. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  401. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  402. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  403. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  404. transformers/models/lightglue/modeling_lightglue.py +3 -1
  405. transformers/models/lightglue/modular_lightglue.py +1 -0
  406. transformers/models/lilt/modeling_lilt.py +23 -15
  407. transformers/models/llama/modeling_llama.py +5 -5
  408. transformers/models/llama/tokenization_llama.py +15 -43
  409. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  410. transformers/models/llama4/modeling_llama4.py +11 -6
  411. transformers/models/llava/image_processing_llava_fast.py +0 -1
  412. transformers/models/llava/modeling_llava.py +12 -7
  413. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  414. transformers/models/llava_next/modeling_llava_next.py +7 -3
  415. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  416. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  417. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  418. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  419. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  420. transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
  421. transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
  422. transformers/models/longformer/modeling_longformer.py +6 -0
  423. transformers/models/longt5/modeling_longt5.py +4 -4
  424. transformers/models/luke/modeling_luke.py +9 -0
  425. transformers/models/luke/tokenization_luke.py +11 -38
  426. transformers/models/lxmert/modeling_lxmert.py +2 -0
  427. transformers/models/m2m_100/modeling_m2m_100.py +14 -0
  428. transformers/models/mamba/modeling_mamba.py +16 -23
  429. transformers/models/mamba2/modeling_mamba2.py +24 -23
  430. transformers/models/marian/configuration_marian.py +1 -1
  431. transformers/models/marian/modeling_marian.py +8 -0
  432. transformers/models/markuplm/modeling_markuplm.py +9 -8
  433. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  434. transformers/models/mask2former/configuration_mask2former.py +3 -3
  435. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  436. transformers/models/mask2former/modeling_mask2former.py +11 -0
  437. transformers/models/maskformer/configuration_maskformer.py +3 -3
  438. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  439. transformers/models/maskformer/modeling_maskformer.py +11 -1
  440. transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
  441. transformers/models/mbart/configuration_mbart.py +1 -0
  442. transformers/models/mbart/modeling_mbart.py +14 -0
  443. transformers/models/mbart/tokenization_mbart.py +11 -52
  444. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  445. transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
  446. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  447. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  448. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  449. transformers/models/mimi/modeling_mimi.py +28 -5
  450. transformers/models/minimax/modeling_minimax.py +19 -6
  451. transformers/models/minimax/modular_minimax.py +12 -1
  452. transformers/models/ministral/modeling_ministral.py +5 -5
  453. transformers/models/ministral3/configuration_ministral3.py +1 -1
  454. transformers/models/ministral3/modeling_ministral3.py +5 -4
  455. transformers/models/mistral/modeling_mistral.py +5 -4
  456. transformers/models/mistral3/modeling_mistral3.py +10 -4
  457. transformers/models/mistral3/modular_mistral3.py +3 -1
  458. transformers/models/mixtral/modeling_mixtral.py +15 -7
  459. transformers/models/mixtral/modular_mixtral.py +6 -2
  460. transformers/models/mlcd/modeling_mlcd.py +6 -0
  461. transformers/models/mlcd/modular_mlcd.py +4 -0
  462. transformers/models/mllama/modeling_mllama.py +15 -4
  463. transformers/models/mluke/tokenization_mluke.py +6 -6
  464. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  465. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
  466. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  467. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  468. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  469. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  470. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  471. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  472. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  473. transformers/models/mobilevit/modeling_mobilevit.py +7 -0
  474. transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
  475. transformers/models/modernbert/modeling_modernbert.py +16 -2
  476. transformers/models/modernbert/modular_modernbert.py +14 -1
  477. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
  478. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
  479. transformers/models/moonshine/modeling_moonshine.py +5 -3
  480. transformers/models/moshi/modeling_moshi.py +26 -53
  481. transformers/models/mpnet/modeling_mpnet.py +7 -0
  482. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  483. transformers/models/mpt/modeling_mpt.py +2 -0
  484. transformers/models/mra/modeling_mra.py +10 -1
  485. transformers/models/mt5/configuration_mt5.py +2 -3
  486. transformers/models/mt5/modeling_mt5.py +7 -10
  487. transformers/models/musicgen/modeling_musicgen.py +7 -9
  488. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
  489. transformers/models/mvp/modeling_mvp.py +14 -0
  490. transformers/models/nanochat/modeling_nanochat.py +5 -5
  491. transformers/models/nemotron/modeling_nemotron.py +7 -5
  492. transformers/models/nllb/tokenization_nllb.py +8 -22
  493. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  494. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  495. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  496. transformers/models/nougat/tokenization_nougat.py +15 -68
  497. transformers/models/nystromformer/modeling_nystromformer.py +13 -0
  498. transformers/models/olmo/modeling_olmo.py +5 -5
  499. transformers/models/olmo/modular_olmo.py +2 -2
  500. transformers/models/olmo2/modeling_olmo2.py +5 -6
  501. transformers/models/olmo2/modular_olmo2.py +0 -1
  502. transformers/models/olmo3/modeling_olmo3.py +5 -5
  503. transformers/models/olmoe/modeling_olmoe.py +15 -7
  504. transformers/models/olmoe/modular_olmoe.py +4 -2
  505. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  506. transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
  507. transformers/models/oneformer/configuration_oneformer.py +3 -3
  508. transformers/models/oneformer/modeling_oneformer.py +11 -39
  509. transformers/models/openai/modeling_openai.py +15 -0
  510. transformers/models/openai/tokenization_openai.py +10 -46
  511. transformers/models/opt/modeling_opt.py +2 -0
  512. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  513. transformers/models/ovis2/modeling_ovis2.py +15 -3
  514. transformers/models/ovis2/modular_ovis2.py +8 -0
  515. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  516. transformers/models/owlv2/modeling_owlv2.py +11 -3
  517. transformers/models/owlv2/modular_owlv2.py +0 -2
  518. transformers/models/owlvit/modeling_owlvit.py +11 -3
  519. transformers/models/paddleocr_vl/__init__.py +32 -0
  520. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  521. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
  522. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  523. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
  524. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
  525. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  526. transformers/models/paligemma/modeling_paligemma.py +25 -17
  527. transformers/models/parakeet/configuration_parakeet.py +4 -6
  528. transformers/models/parakeet/modeling_parakeet.py +14 -6
  529. transformers/models/parakeet/modular_parakeet.py +7 -2
  530. transformers/models/parakeet/processing_parakeet.py +1 -0
  531. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  532. transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
  533. transformers/models/patchtst/modeling_patchtst.py +25 -6
  534. transformers/models/pe_audio/__init__.py +30 -0
  535. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  536. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  537. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  538. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  539. transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
  540. transformers/models/pe_audio_video/__init__.py +29 -0
  541. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  542. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  543. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  544. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  545. transformers/models/pe_video/__init__.py +30 -0
  546. transformers/models/pe_video/configuration_pe_video.py +211 -0
  547. transformers/models/pe_video/modeling_pe_video.py +636 -0
  548. transformers/models/pe_video/modular_pe_video.py +219 -0
  549. transformers/models/pe_video/processing_pe_video.py +10 -0
  550. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  551. transformers/models/pegasus/configuration_pegasus.py +1 -0
  552. transformers/models/pegasus/modeling_pegasus.py +8 -0
  553. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  554. transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
  555. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  556. transformers/models/perceiver/modeling_perceiver.py +13 -1
  557. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  558. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  559. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  560. transformers/models/persimmon/modeling_persimmon.py +3 -2
  561. transformers/models/phi/modeling_phi.py +5 -6
  562. transformers/models/phi/modular_phi.py +0 -1
  563. transformers/models/phi3/modeling_phi3.py +3 -2
  564. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
  565. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
  566. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  567. transformers/models/phimoe/modeling_phimoe.py +15 -7
  568. transformers/models/phimoe/modular_phimoe.py +3 -3
  569. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  570. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  571. transformers/models/pixio/__init__.py +30 -0
  572. transformers/models/pixio/configuration_pixio.py +151 -0
  573. transformers/models/pixio/modeling_pixio.py +507 -0
  574. transformers/models/pixio/modular_pixio.py +404 -0
  575. transformers/models/pixtral/modeling_pixtral.py +3 -2
  576. transformers/models/pixtral/processing_pixtral.py +3 -1
  577. transformers/models/plbart/configuration_plbart.py +1 -0
  578. transformers/models/plbart/modeling_plbart.py +13 -0
  579. transformers/models/plbart/modular_plbart.py +8 -0
  580. transformers/models/plbart/tokenization_plbart.py +0 -2
  581. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  582. transformers/models/poolformer/modeling_poolformer.py +13 -1
  583. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  584. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  585. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  586. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  587. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  588. transformers/models/prophetnet/modeling_prophetnet.py +5 -1
  589. transformers/models/pvt/modeling_pvt.py +2 -0
  590. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  591. transformers/models/qwen2/modeling_qwen2.py +5 -5
  592. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  593. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  594. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
  595. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
  596. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  597. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
  598. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
  599. transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
  600. transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
  601. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  602. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  603. transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
  604. transformers/models/qwen3/modeling_qwen3.py +5 -5
  605. transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
  606. transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
  607. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  608. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
  609. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
  610. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  611. transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
  612. transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
  613. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  614. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
  615. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
  616. transformers/models/rag/configuration_rag.py +0 -8
  617. transformers/models/rag/modeling_rag.py +8 -9
  618. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
  619. transformers/models/reformer/modeling_reformer.py +13 -1
  620. transformers/models/reformer/tokenization_reformer.py +11 -28
  621. transformers/models/regnet/modeling_regnet.py +10 -1
  622. transformers/models/rembert/modeling_rembert.py +13 -1
  623. transformers/models/rembert/tokenization_rembert.py +3 -10
  624. transformers/models/resnet/modeling_resnet.py +19 -5
  625. transformers/models/roberta/modeling_roberta.py +3 -0
  626. transformers/models/roberta/modular_roberta.py +3 -0
  627. transformers/models/roberta/tokenization_roberta.py +18 -27
  628. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  629. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  630. transformers/models/roformer/modeling_roformer.py +6 -0
  631. transformers/models/roformer/tokenization_roformer.py +77 -412
  632. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  633. transformers/models/rt_detr/modeling_rt_detr.py +6 -0
  634. transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
  635. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  636. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
  637. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  638. transformers/models/rwkv/modeling_rwkv.py +2 -1
  639. transformers/models/sam/configuration_sam.py +1 -0
  640. transformers/models/sam/image_processing_sam_fast.py +0 -1
  641. transformers/models/sam/modeling_sam.py +4 -1
  642. transformers/models/sam2/configuration_sam2.py +1 -1
  643. transformers/models/sam2/modeling_sam2.py +7 -3
  644. transformers/models/sam2/modular_sam2.py +7 -3
  645. transformers/models/sam2_video/modeling_sam2_video.py +52 -43
  646. transformers/models/sam2_video/modular_sam2_video.py +32 -18
  647. transformers/models/sam3/configuration_sam3.py +21 -1
  648. transformers/models/sam3/modeling_sam3.py +100 -80
  649. transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
  650. transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
  651. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  652. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
  653. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  654. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  655. transformers/models/sam3_video/modeling_sam3_video.py +4 -3
  656. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  657. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  658. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  659. transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
  660. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  661. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
  662. transformers/models/seed_oss/modeling_seed_oss.py +3 -3
  663. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  664. transformers/models/segformer/modeling_segformer.py +6 -3
  665. transformers/models/segformer/modular_segformer.py +0 -1
  666. transformers/models/seggpt/modeling_seggpt.py +2 -0
  667. transformers/models/sew/modeling_sew.py +3 -0
  668. transformers/models/sew/modular_sew.py +1 -0
  669. transformers/models/sew_d/modeling_sew_d.py +3 -0
  670. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  671. transformers/models/siglip/modeling_siglip.py +24 -2
  672. transformers/models/siglip2/modeling_siglip2.py +67 -41
  673. transformers/models/siglip2/modular_siglip2.py +4 -0
  674. transformers/models/smollm3/modeling_smollm3.py +5 -5
  675. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  676. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  677. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  678. transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
  679. transformers/models/speecht5/modeling_speecht5.py +41 -1
  680. transformers/models/splinter/modeling_splinter.py +12 -3
  681. transformers/models/splinter/tokenization_splinter.py +9 -28
  682. transformers/models/squeezebert/modeling_squeezebert.py +8 -0
  683. transformers/models/stablelm/modeling_stablelm.py +4 -2
  684. transformers/models/starcoder2/modeling_starcoder2.py +5 -4
  685. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  686. transformers/models/superglue/modeling_superglue.py +1 -0
  687. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  688. transformers/models/superpoint/modeling_superpoint.py +1 -0
  689. transformers/models/swiftformer/modeling_swiftformer.py +6 -0
  690. transformers/models/swin/modeling_swin.py +20 -12
  691. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  692. transformers/models/swin2sr/modeling_swin2sr.py +51 -33
  693. transformers/models/swinv2/modeling_swinv2.py +45 -33
  694. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  695. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  696. transformers/models/t5/configuration_t5.py +7 -1
  697. transformers/models/t5/modeling_t5.py +8 -7
  698. transformers/models/t5/tokenization_t5.py +4 -8
  699. transformers/models/t5gemma/modeling_t5gemma.py +6 -6
  700. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  701. transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
  702. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  703. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  704. transformers/models/table_transformer/modeling_table_transformer.py +5 -1
  705. transformers/models/tapas/modeling_tapas.py +3 -0
  706. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  707. transformers/models/textnet/modeling_textnet.py +11 -2
  708. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  709. transformers/models/timesfm/modeling_timesfm.py +14 -0
  710. transformers/models/timesfm/modular_timesfm.py +14 -0
  711. transformers/models/timesformer/modeling_timesformer.py +2 -0
  712. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  713. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  714. transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
  715. transformers/models/trocr/modeling_trocr.py +3 -2
  716. transformers/models/tvp/configuration_tvp.py +5 -1
  717. transformers/models/tvp/modeling_tvp.py +6 -4
  718. transformers/models/udop/configuration_udop.py +1 -0
  719. transformers/models/udop/modeling_udop.py +7 -7
  720. transformers/models/udop/tokenization_udop.py +5 -13
  721. transformers/models/umt5/configuration_umt5.py +2 -2
  722. transformers/models/umt5/modeling_umt5.py +7 -6
  723. transformers/models/unispeech/modeling_unispeech.py +4 -0
  724. transformers/models/unispeech/modular_unispeech.py +2 -0
  725. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  726. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  727. transformers/models/univnet/modeling_univnet.py +1 -0
  728. transformers/models/upernet/modeling_upernet.py +1 -0
  729. transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
  730. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  731. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  732. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  733. transformers/models/video_llava/modeling_video_llava.py +7 -3
  734. transformers/models/vilt/configuration_vilt.py +2 -2
  735. transformers/models/vilt/modeling_vilt.py +13 -0
  736. transformers/models/vipllava/modeling_vipllava.py +7 -3
  737. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  738. transformers/models/visual_bert/modeling_visual_bert.py +8 -0
  739. transformers/models/vitdet/modeling_vitdet.py +2 -0
  740. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  741. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  742. transformers/models/vitmatte/modeling_vitmatte.py +5 -0
  743. transformers/models/vitpose/configuration_vitpose.py +1 -1
  744. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  745. transformers/models/vits/modeling_vits.py +1 -0
  746. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  747. transformers/models/voxtral/modeling_voxtral.py +2 -2
  748. transformers/models/voxtral/modular_voxtral.py +2 -2
  749. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  750. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
  751. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
  752. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
  753. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  754. transformers/models/wavlm/modeling_wavlm.py +5 -0
  755. transformers/models/whisper/generation_whisper.py +1 -0
  756. transformers/models/whisper/modeling_whisper.py +11 -3
  757. transformers/models/whisper/tokenization_whisper.py +4 -15
  758. transformers/models/x_clip/modeling_x_clip.py +5 -0
  759. transformers/models/xcodec/modeling_xcodec.py +5 -0
  760. transformers/models/xglm/modeling_xglm.py +11 -0
  761. transformers/models/xglm/tokenization_xglm.py +4 -9
  762. transformers/models/xlm/modeling_xlm.py +18 -14
  763. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  764. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  765. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  766. transformers/models/xlnet/modeling_xlnet.py +3 -1
  767. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  768. transformers/models/xmod/modeling_xmod.py +3 -0
  769. transformers/models/yoso/modeling_yoso.py +10 -1
  770. transformers/models/zamba/modeling_zamba.py +4 -1
  771. transformers/models/zamba2/modeling_zamba2.py +7 -4
  772. transformers/models/zamba2/modular_zamba2.py +1 -1
  773. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  774. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  775. transformers/models/zoedepth/modeling_zoedepth.py +8 -0
  776. transformers/pipelines/__init__.py +11 -9
  777. transformers/pipelines/automatic_speech_recognition.py +20 -12
  778. transformers/pipelines/base.py +2 -10
  779. transformers/pipelines/document_question_answering.py +4 -2
  780. transformers/pipelines/question_answering.py +1 -1
  781. transformers/pipelines/text_generation.py +1 -1
  782. transformers/pipelines/text_to_audio.py +2 -2
  783. transformers/processing_utils.py +133 -50
  784. transformers/quantizers/auto.py +2 -4
  785. transformers/quantizers/base.py +44 -174
  786. transformers/quantizers/quantizer_aqlm.py +2 -23
  787. transformers/quantizers/quantizer_auto_round.py +2 -12
  788. transformers/quantizers/quantizer_awq.py +20 -89
  789. transformers/quantizers/quantizer_bitnet.py +4 -14
  790. transformers/quantizers/quantizer_bnb_4bit.py +18 -155
  791. transformers/quantizers/quantizer_bnb_8bit.py +24 -110
  792. transformers/quantizers/quantizer_compressed_tensors.py +2 -9
  793. transformers/quantizers/quantizer_eetq.py +16 -74
  794. transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
  795. transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
  796. transformers/quantizers/quantizer_fp_quant.py +52 -82
  797. transformers/quantizers/quantizer_gptq.py +8 -28
  798. transformers/quantizers/quantizer_higgs.py +42 -60
  799. transformers/quantizers/quantizer_hqq.py +144 -153
  800. transformers/quantizers/quantizer_mxfp4.py +14 -194
  801. transformers/quantizers/quantizer_quanto.py +35 -79
  802. transformers/quantizers/quantizer_quark.py +36 -17
  803. transformers/quantizers/quantizer_spqr.py +4 -12
  804. transformers/quantizers/quantizer_torchao.py +50 -325
  805. transformers/quantizers/quantizer_vptq.py +4 -27
  806. transformers/quantizers/quantizers_utils.py +20 -0
  807. transformers/testing_utils.py +324 -47
  808. transformers/tokenization_mistral_common.py +7 -2
  809. transformers/tokenization_utils_base.py +116 -224
  810. transformers/tokenization_utils_tokenizers.py +190 -106
  811. transformers/trainer.py +51 -32
  812. transformers/trainer_callback.py +8 -0
  813. transformers/trainer_jit_checkpoint.py +126 -0
  814. transformers/trainer_seq2seq.py +4 -0
  815. transformers/trainer_utils.py +1 -1
  816. transformers/training_args.py +74 -38
  817. transformers/utils/__init__.py +7 -4
  818. transformers/utils/attention_visualizer.py +4 -4
  819. transformers/utils/auto_docstring.py +35 -25
  820. transformers/utils/generic.py +47 -1
  821. transformers/utils/hub.py +5 -15
  822. transformers/utils/import_utils.py +112 -25
  823. transformers/utils/kernel_config.py +74 -19
  824. transformers/utils/loading_report.py +19 -10
  825. transformers/utils/quantization_config.py +78 -245
  826. transformers/video_processing_utils.py +17 -14
  827. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
  828. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
  829. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
  830. transformers/kernels/__init__.py +0 -0
  831. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  832. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  833. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  834. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
  835. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -150,8 +150,8 @@ class LayoutXLMTokenizer(TokenizersBackend):
150
150
  refer to this superclass for more information regarding those methods.
151
151
 
152
152
  Args:
153
- vocab (`list[tuple[str, float]]`, *optional*):
154
- Vocabulary for the tokenizer as a list of (token, score) tuples.
153
+ vocab (`str`, `dict` or `list`, *optional*):
154
+ Vocabulary for the tokenizer as a path, a dictionary or a list of `(token, score)` tuples.
155
155
  bos_token (`str`, *optional*, defaults to `"<s>"`):
156
156
  The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
157
157
 
@@ -206,12 +206,11 @@ class LayoutXLMTokenizer(TokenizersBackend):
206
206
 
207
207
  vocab_files_names = VOCAB_FILES_NAMES
208
208
  model_input_names = ["input_ids", "attention_mask"]
209
- slow_tokenizer_class = None
209
+ model = Unigram
210
210
 
211
211
  def __init__(
212
212
  self,
213
- vocab_file=None,
214
- vocab=None,
213
+ vocab: Optional[Union[str, list]] = None,
215
214
  bos_token="<s>",
216
215
  eos_token="</s>",
217
216
  sep_token="</s>",
@@ -229,17 +228,10 @@ class LayoutXLMTokenizer(TokenizersBackend):
229
228
  ):
230
229
  # Mask token behave like a normal word, i.e. include the space before it
231
230
  mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
232
-
233
231
  self.add_prefix_space = add_prefix_space
234
232
 
235
- # Build vocab from list of tuples if provided, else use default
236
- # Handle both list of tuples (when creating) and dict (when loading)
237
233
  if vocab is not None:
238
- if isinstance(vocab, dict):
239
- # Convert dict to list of tuples
240
- self._vocab = [(token, score) for token, score in vocab.items()]
241
- else:
242
- self._vocab = vocab
234
+ self._vocab = vocab
243
235
  else:
244
236
  self._vocab = [
245
237
  ("<s>", 0.0),
@@ -250,10 +242,7 @@ class LayoutXLMTokenizer(TokenizersBackend):
250
242
  if mask_token not in [v[0] for v in self._vocab]:
251
243
  self._vocab.append((str(mask_token), 0.0))
252
244
 
253
- # Create the Unigram tokenizer
254
245
  self._tokenizer = Tokenizer(Unigram(self._vocab, unk_id=3, byte_fallback=False))
255
-
256
- # Set up normalizer (strip right, replace multiple spaces)
257
246
  self._tokenizer.normalizer = normalizers.Sequence(
258
247
  [
259
248
  normalizers.Strip(left=False, right=True),
@@ -261,30 +250,11 @@ class LayoutXLMTokenizer(TokenizersBackend):
  ]
  )

- # Set up pre_tokenizer (Metaspace)
  prepend_scheme = _get_prepend_scheme(add_prefix_space, self)
  self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)

- # Set up decoder
  self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)

- # Set up post_processor for XLM-RoBERTa style
- # Get token IDs
- cls_token_id = self._get_token_id(str(cls_token))
- sep_token_id = self._get_token_id(str(sep_token))
-
- self._tokenizer.post_processor = processors.TemplateProcessing(
- single="<s> $A </s>",
- pair="<s> $A </s> </s> $B </s>",
- special_tokens=[
- ("<s>", cls_token_id),
- ("</s>", sep_token_id),
- ],
- )
-
- tokenizer_object = self._tokenizer
-
- # additional properties
  self.cls_token_box = cls_token_box
  self.sep_token_box = sep_token_box
  self.pad_token_box = pad_token_box
@@ -292,7 +262,6 @@ class LayoutXLMTokenizer(TokenizersBackend):
  self.only_label_first_subword = only_label_first_subword

  super().__init__(
- tokenizer_object=tokenizer_object,
  bos_token=bos_token,
  eos_token=eos_token,
  sep_token=sep_token,
@@ -300,7 +269,6 @@ class LayoutXLMTokenizer(TokenizersBackend):
  unk_token=unk_token,
  pad_token=pad_token,
  mask_token=mask_token,
- vocab_file=vocab_file,
  vocab=vocab,
  add_prefix_space=add_prefix_space,
  cls_token_box=cls_token_box,
@@ -311,7 +279,14 @@ class LayoutXLMTokenizer(TokenizersBackend):
  **kwargs,
  )

- self.vocab_file = vocab_file
+ self._tokenizer.post_processor = processors.TemplateProcessing(
+ single=f"{str(self.cls_token)}:0 $A:0 {str(self.sep_token)}:0",
+ pair=f"{str(self.cls_token)}:0 $A:0 {str(self.sep_token)}:0 {str(self.sep_token)}:0 $B:0 {str(self.sep_token)}:0",
+ special_tokens=[
+ (str(self.cls_token), self.cls_token_id),
+ (str(self.sep_token), self.sep_token_id),
+ ],
+ )

  def _get_token_id(self, token: str) -> int:
  """Helper to get token ID from vocab."""
@@ -23,6 +23,7 @@ import torch
  from torch import nn
  from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

+ from ... import initialization as init
  from ...activations import ACT2FN
  from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
  from ...generation import GenerationMixin
@@ -1077,6 +1078,11 @@ class LEDPreTrainedModel(PreTrainedModel):
  }
  return dummy_inputs

+ def _init_weights(self, module):
+ super()._init_weights(module)
+ if isinstance(module, LEDForConditionalGeneration):
+ init.zeros_(module.final_logits_bias)
+

  @dataclass
  @auto_docstring(
@@ -1379,6 +1385,7 @@ class LEDEncoder(LEDPreTrainedModel):
  output_attentions=None,
  output_hidden_states=None,
  return_dict=None,
+ **kwargs,
  ):
  r"""
  Args:
@@ -1573,6 +1580,7 @@ class LEDDecoder(LEDPreTrainedModel):
  output_hidden_states=None,
  return_dict=None,
  cache_position=None,
+ **kwargs,
  ):
  r"""
  Args:
@@ -1788,6 +1796,7 @@ class LEDModel(LEDPreTrainedModel):
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  cache_position: Optional[torch.Tensor] = None,
+ **kwargs,
  ) -> Union[tuple[torch.Tensor], LEDSeq2SeqModelOutput]:
  r"""
  decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1938,6 +1947,7 @@ class LEDForConditionalGeneration(LEDPreTrainedModel, GenerationMixin):
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  cache_position: Optional[torch.Tensor] = None,
+ **kwargs,
  ) -> Union[tuple[torch.Tensor], LEDSeq2SeqLMOutput]:
  r"""
  decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -2120,6 +2130,7 @@ class LEDForSequenceClassification(LEDPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple[torch.Tensor], LEDSeq2SeqSequenceClassifierOutput]:
  r"""
  decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -2258,6 +2269,7 @@ class LEDForQuestionAnswering(LEDPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple[torch.Tensor], LEDSeq2SeqQuestionAnsweringModelOutput]:
  r"""
  decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
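The six LED hunks above all add a trailing `**kwargs` to forward signatures, so newer call sites (for example the generation stack forwarding backend-specific attention kwargs) no longer raise a `TypeError` on models that do not consume them. A toy illustration of the pattern, not LED code:

    def forward(self, input_ids, attention_mask=None, **kwargs):
        # Unrecognized keyword arguments are accepted and silently ignored
        # instead of breaking the call with a TypeError.
        ...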
@@ -21,6 +21,7 @@ from typing import Optional, Union
  import torch
  from torch import nn

+ from ... import initialization as init
  from ...modeling_outputs import (
  BaseModelOutputWithNoAttention,
  BaseModelOutputWithPoolingAndNoAttention,
@@ -165,6 +166,7 @@ class LevitAttention(nn.Module):

  points = list(itertools.product(range(resolution), range(resolution)))
  len_points = len(points)
+ self.len_points = len_points
  attention_offsets, indices = {}, []
  for p1 in points:
  for p2 in points:
@@ -172,6 +174,7 @@ class LevitAttention(nn.Module):
  if offset not in attention_offsets:
  attention_offsets[offset] = len(attention_offsets)
  indices.append(attention_offsets[offset])
+ self.indices = indices

  self.attention_bias_cache = {}
  self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets)))
@@ -243,6 +246,8 @@ class LevitAttentionSubsample(nn.Module):
  points = list(itertools.product(range(resolution_in), range(resolution_in)))
  points_ = list(itertools.product(range(resolution_out), range(resolution_out)))
  len_points, len_points_ = len(points), len(points_)
+ self.len_points_ = len_points_
+ self.len_points = len_points
  attention_offsets, indices = {}, []
  for p1 in points_:
  for p2 in points:
@@ -251,6 +256,7 @@ class LevitAttentionSubsample(nn.Module):
  if offset not in attention_offsets:
  attention_offsets[offset] = len(attention_offsets)
  indices.append(attention_offsets[offset])
+ self.indices = indices

  self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets)))
  self.register_buffer(
@@ -472,6 +478,18 @@ class LevitPreTrainedModel(PreTrainedModel):
  input_modalities = ("image",)
  _no_split_modules = ["LevitResidualLayer"]

+ def _init_weights(self, module):
+ super()._init_weights(module)
+ if isinstance(module, LevitAttention):
+ init.copy_(
+ module.attention_bias_idxs, torch.LongTensor(module.indices).view(module.len_points, module.len_points)
+ )
+ elif isinstance(module, LevitAttentionSubsample):
+ init.copy_(
+ module.attention_bias_idxs,
+ torch.LongTensor(module.indices).view(module.len_points_, module.len_points),
+ )
+

  @auto_docstring
  class LevitModel(LevitPreTrainedModel):
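The LED and Levit hunks follow the same refactor: deterministic tensor setup moves out of `__init__` and into `_init_weights`, with the inputs it needs (`indices`, `len_points`, ...) stashed on the module. That keeps initialization replayable when weights are materialized lazily, e.g. from the meta device. A minimal, self-contained sketch of the pattern in plain PyTorch; `ToyAttention`/`ToyModel` are hypothetical stand-ins, not transformers classes:

    import torch
    from torch import nn

    class ToyAttention(nn.Module):
        def __init__(self, num_offsets: int):
            super().__init__()
            # Keep the derived metadata on the module so init can rebuild the buffer later.
            self.indices = list(range(num_offsets))
            self.register_buffer("attention_bias_idxs", torch.zeros(num_offsets, dtype=torch.long))

    class ToyModel(nn.Module):
        @torch.no_grad()
        def _init_weights(self, module):
            if isinstance(module, ToyAttention):
                # Fill the buffer from the saved metadata instead of in __init__.
                module.attention_bias_idxs.copy_(torch.LongTensor(module.indices))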
@@ -489,6 +507,7 @@ class LevitModel(LevitPreTrainedModel):
  pixel_values: Optional[torch.FloatTensor] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]:
  output_hidden_states = (
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -550,6 +569,7 @@ class LevitForImageClassification(LevitPreTrainedModel):
  labels: Optional[torch.LongTensor] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -616,6 +636,7 @@ class LevitForImageClassificationWithTeacher(LevitPreTrainedModel):
  pixel_values: Optional[torch.FloatTensor] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, LevitForImageClassificationWithTeacherOutput]:
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -26,7 +26,7 @@ from torch import nn

  from ...cache_utils import Cache
  from ...generation import GenerationMixin
- from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+ from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
  from ...masking_utils import create_causal_mask
  from ...modeling_layers import GradientCheckpointingLayer
  from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -34,7 +34,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
  from ...processing_utils import Unpack
  from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
- from ...utils.generic import check_model_inputs
+ from ...utils.generic import check_model_inputs, maybe_autocast
  from ...utils.import_utils import is_causal_conv1d_available, is_torchdynamo_compiling
  from .configuration_lfm2 import Lfm2Config

@@ -83,7 +83,7 @@ class Lfm2RotaryEmbedding(nn.Module):
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

  self.register_buffer("inv_freq", inv_freq, persistent=False)
- self.original_inv_freq = inv_freq
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

  @staticmethod
  def compute_default_rope_parameters(
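The change from a plain attribute to `register_buffer(..., inv_freq.clone(), ...)` matters because the old attribute aliased the same storage as `inv_freq`: any in-place dynamic-RoPE update to `inv_freq` would silently corrupt the saved original, and a bare attribute also does not follow `.to(device)`. A small stand-alone demonstration of the aliasing issue:

    import torch
    from torch import nn

    class Rope(nn.Module):
        def __init__(self, inv_freq: torch.Tensor):
            super().__init__()
            self.register_buffer("inv_freq", inv_freq, persistent=False)
            # Old behavior: `self.original_inv_freq = inv_freq` shares storage,
            # so in-place updates to inv_freq leak into the "original" copy.
            self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    rope = Rope(torch.arange(1.0, 5.0))
    rope.inv_freq.mul_(2.0)  # e.g. a dynamic-RoPE rescale
    assert not torch.equal(rope.inv_freq, rope.original_inv_freq)  # original survives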
@@ -122,7 +122,7 @@ class Lfm2RotaryEmbedding(nn.Module):
  position_ids_expanded = position_ids[:, None, :].float()

  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
- with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+ with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
  emb = torch.cat((freqs, freqs), dim=-1)
  cos = emb.cos() * self.attention_scaling
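`maybe_autocast` is used here as a drop-in for `torch.autocast`; judging only from this call site it must accept `device_type=` and `enabled=` and work as a context manager. A hypothetical reconstruction of such a helper (the real transformers implementation may differ):

    import contextlib
    import torch

    def maybe_autocast(device_type: str, enabled: bool = True):
        # Fall back to a no-op context when autocast rejects the device type.
        try:
            return torch.autocast(device_type=device_type, enabled=enabled)
        except RuntimeError:
            return contextlib.nullcontext()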
@@ -358,6 +358,7 @@ def eager_attention_forward(
  return attn_output, attn_weights


+ @use_kernelized_func(apply_rotary_pos_emb)
  class Lfm2Attention(nn.Module):
  """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -372,7 +373,6 @@ class Lfm2Attention(nn.Module):
  self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
  self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
  self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
- self.rotary_fn = apply_rotary_pos_emb
  self.out_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
  self.q_layernorm = Lfm2RMSNorm(self.head_dim, eps=config.norm_eps)
  self.k_layernorm = Lfm2RMSNorm(self.head_dim, eps=config.norm_eps)
@@ -384,7 +384,6 @@ class Lfm2Attention(nn.Module):
  attention_mask: Optional[torch.Tensor],
  past_key_values: Optional[Lfm2HybridConvCache] = None,
  cache_position: Optional[torch.LongTensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
  **kwargs,
  ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
  input_shape = hidden_states.shape[:-1]
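Together these hunks replace the per-instance `self.rotary_fn = apply_rotary_pos_emb` assignment with a class-level `@use_kernelized_func(apply_rotary_pos_emb)` decorator. The exact semantics of `use_kernelized_func` are not shown in this diff; a plausible, purely illustrative reading is a decorator that attaches the function at the class level so a hub kernel can later be swapped in (the attribute name and behavior below are assumptions, not the transformers implementation):

    def use_kernelized_func(func):
        # Hypothetical sketch only.
        def wrap(cls):
            cls.rotary_fn = staticmethod(func)  # one shared attribute instead of one per instance
            return cls
        return wrap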
@@ -233,7 +233,6 @@ class Lfm2Attention(LlamaAttention):
  attention_mask: Optional[torch.Tensor],
  past_key_values: Optional[Lfm2HybridConvCache] = None,
  cache_position: Optional[torch.LongTensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
  **kwargs,
  ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
  input_shape = hidden_states.shape[:-1]
@@ -27,7 +27,12 @@ from torch import nn
  from ... import initialization as init
  from ...cache_utils import Cache
  from ...generation import GenerationMixin
- from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+ from ...integrations import (
+ use_experts_implementation,
+ use_kernel_forward_from_hub,
+ use_kernel_func_from_hub,
+ use_kernelized_func,
+ )
  from ...masking_utils import create_causal_mask
  from ...modeling_layers import GradientCheckpointingLayer
  from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, MoeModelOutputWithPast
@@ -35,7 +40,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
  from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
  from ...processing_utils import Unpack
  from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
- from ...utils.generic import check_model_inputs
+ from ...utils.generic import check_model_inputs, maybe_autocast
  from ...utils.import_utils import is_causal_conv1d_available, is_torchdynamo_compiling
  from .configuration_lfm2_moe import Lfm2MoeConfig

 
@@ -84,7 +89,7 @@ class Lfm2MoeRotaryEmbedding(nn.Module):
84
89
  inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
85
90
 
86
91
  self.register_buffer("inv_freq", inv_freq, persistent=False)
87
- self.original_inv_freq = inv_freq
92
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
88
93
 
89
94
  @staticmethod
90
95
  def compute_default_rope_parameters(
@@ -123,7 +128,7 @@ class Lfm2MoeRotaryEmbedding(nn.Module):
123
128
  position_ids_expanded = position_ids[:, None, :].float()
124
129
 
125
130
  device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
126
- with torch.autocast(device_type=device_type, enabled=False): # Force float32
131
+ with maybe_autocast(device_type=device_type, enabled=False): # Force float32
127
132
  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
128
133
  emb = torch.cat((freqs, freqs), dim=-1)
129
134
  cos = emb.cos() * self.attention_scaling
@@ -145,6 +150,7 @@ class Lfm2MoeMLP(nn.Module):
  return self.w2(F.silu(self.w1(x)) * self.w3(x))


+ @use_experts_implementation
  class Lfm2MoeExperts(nn.Module):
  """Collection of expert weights stored as 3D tensors."""

@@ -155,6 +161,7 @@ class Lfm2MoeExperts(nn.Module):
  self.intermediate_dim = config.moe_intermediate_size
  self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, 2 * self.intermediate_dim, self.hidden_dim))
  self.down_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, self.intermediate_dim))
+ self.act_fn = F.silu

  def forward(
  self,
@@ -175,7 +182,7 @@ class Lfm2MoeExperts(nn.Module):
  top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
  current_state = hidden_states[token_idx]
  gate, up = nn.functional.linear(current_state, self.gate_up_proj[expert_idx]).chunk(2, dim=-1)
- current_hidden_states = F.silu(gate) * up
+ current_hidden_states = self.act_fn(gate) * up
  current_hidden_states = nn.functional.linear(current_hidden_states, self.down_proj[expert_idx])
  current_hidden_states = current_hidden_states * top_k_weights[token_idx, top_k_pos, None]
  final_hidden_states.index_add_(0, token_idx, current_hidden_states.to(final_hidden_states.dtype))
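For readers skimming the hunk above, the surrounding `forward` implements the classic sparse-MoE dispatch: build a one-hot routing mask, loop only over experts that received tokens, and scatter the weighted expert outputs back with `index_add_`. A self-contained sketch of that dispatch with toy shapes and no transformers dependency (a real expert would apply its MLP where noted):

    import torch
    import torch.nn.functional as F

    num_tokens, hidden, num_experts, top_k = 6, 4, 3, 2
    hidden_states = torch.randn(num_tokens, hidden)
    top_k_index = torch.randint(num_experts, (num_tokens, top_k))
    top_k_weights = torch.rand(num_tokens, top_k)

    out = torch.zeros_like(hidden_states)
    # expert_mask[e, k, t] == 1 iff token t picked expert e in routing slot k
    expert_mask = F.one_hot(top_k_index, num_classes=num_experts).permute(2, 1, 0)
    for expert_idx in expert_mask.sum(dim=(-1, -2)).nonzero().flatten():
        top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
        expert_out = hidden_states[token_idx]  # the expert MLP would run here
        out.index_add_(0, token_idx, expert_out * top_k_weights[token_idx, top_k_pos, None])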
@@ -426,6 +433,7 @@ def eager_attention_forward(
  return attn_output, attn_weights


+ @use_kernelized_func(apply_rotary_pos_emb)
  class Lfm2MoeAttention(nn.Module):
  """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -440,7 +448,6 @@ class Lfm2MoeAttention(nn.Module):
  self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
  self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
  self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
- self.rotary_fn = apply_rotary_pos_emb
  self.out_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
  self.q_layernorm = Lfm2MoeRMSNorm(self.head_dim, eps=config.norm_eps)
  self.k_layernorm = Lfm2MoeRMSNorm(self.head_dim, eps=config.norm_eps)
@@ -452,7 +459,6 @@ class Lfm2MoeAttention(nn.Module):
  attention_mask: Optional[torch.Tensor],
  past_key_values: Optional[Lfm2MoeHybridConvCache] = None,
  cache_position: Optional[torch.LongTensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
  **kwargs,
  ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
  input_shape = hidden_states.shape[:-1]
@@ -672,7 +678,7 @@ class Lfm2MoePreTrainedModel(PreTrainedModel):
  _supports_flash_attn = True
  _supports_sdpa = True
  _supports_flex_attn = True
- _can_compile_fullgraph = False
+ _can_compile_fullgraph = False  # uses a non-compilable custom cache class Lfm2MoeHybridConvCache
  _supports_attention_backend = True
  _can_record_outputs = {
  "hidden_states": Lfm2MoeDecoderLayer,
@@ -685,6 +691,9 @@ class Lfm2MoePreTrainedModel(PreTrainedModel):
  if isinstance(module, Lfm2MoeExperts):
  init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
  init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
+ elif isinstance(module, Lfm2MoeSparseMoeBlock):
+ if module.use_expert_bias:
+ init.zeros_(module.expert_bias)


  @auto_docstring
@@ -72,33 +72,7 @@ class Lfm2MoeMLP(Lfm2MLP):
  class Lfm2MoeExperts(Qwen2MoeExperts):
  def __init__(self, config):
  super().__init__(config)
- del self.act_fn
-
- def forward(
- self,
- hidden_states: torch.Tensor,
- top_k_index: torch.Tensor,
- top_k_weights: torch.Tensor,
- ) -> torch.Tensor:
- final_hidden_states = torch.zeros_like(hidden_states)
- with torch.no_grad():
- expert_mask = torch.nn.functional.one_hot(top_k_index, num_classes=self.num_experts)
- expert_mask = expert_mask.permute(2, 1, 0)
- expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-
- for expert_idx in expert_hit:
- expert_idx = expert_idx[0]
- if expert_idx == self.num_experts:
- continue
- top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
- current_state = hidden_states[token_idx]
- gate, up = nn.functional.linear(current_state, self.gate_up_proj[expert_idx]).chunk(2, dim=-1)
- current_hidden_states = F.silu(gate) * up
- current_hidden_states = nn.functional.linear(current_hidden_states, self.down_proj[expert_idx])
- current_hidden_states = current_hidden_states * top_k_weights[token_idx, top_k_pos, None]
- final_hidden_states.index_add_(0, token_idx, current_hidden_states.to(final_hidden_states.dtype))
-
- return final_hidden_states
+ self.act_fn = F.silu


  class Lfm2MoeSparseMoeBlock(nn.Module):
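The modular file no longer needs its own `forward`: once the base experts class gained an `act_fn` attribute, the subclass can reuse the inherited dispatch loop and only swap the activation. A toy sketch of the design choice (classes are illustrative, not transformers code):

    import torch.nn.functional as F

    class ParentExperts:
        def __init__(self):
            self.act_fn = F.gelu

        def forward(self, x):
            return self.act_fn(x)  # stands in for the full expert dispatch loop

    class ChildExperts(ParentExperts):
        def __init__(self):
            super().__init__()
            self.act_fn = F.silu  # the behavior change is confined to one attribute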
@@ -160,7 +134,7 @@ class Lfm2MoeDecoderLayer(Lfm2DecoderLayer):


  class Lfm2MoePreTrainedModel(LlamaPreTrainedModel):
- _can_compile_fullgraph = False
+ _can_compile_fullgraph = False  # uses a non-compilable custom cache class Lfm2MoeHybridConvCache

  @torch.no_grad()
  def _init_weights(self, module):
@@ -168,6 +142,9 @@ class Lfm2MoePreTrainedModel(LlamaPreTrainedModel):
  if isinstance(module, Lfm2MoeExperts):
  init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
  init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
+ elif isinstance(module, Lfm2MoeSparseMoeBlock):
+ if module.use_expert_bias:
+ init.zeros_(module.expert_bias)


  class Lfm2MoeModel(MixtralModel):
@@ -46,6 +46,8 @@ class Lfm2VlConfig(PreTrainedConfig):
  The hidden size of the multimodal projector.
  projector_bias (`bool`, *optional*, defaults to `True`):
  Whether to use bias in the multimodal projector.
+ projector_use_layernorm (`bool`, *optional*, defaults to `True`):
+ Whether to use layernorm in the multimodal projector.
  downsample_factor (`int`, *optional*, defaults to 2):
  The downsampling factor of the vision backbone.
  """
@@ -61,6 +63,7 @@ class Lfm2VlConfig(PreTrainedConfig):
  projector_hidden_act="gelu",
  projector_hidden_size=2560,
  projector_bias=True,
+ projector_use_layernorm=True,
  downsample_factor=2,
  **kwargs,
  ):
@@ -68,6 +71,7 @@ class Lfm2VlConfig(PreTrainedConfig):
  self.projector_hidden_act = projector_hidden_act
  self.projector_hidden_size = projector_hidden_size
  self.projector_bias = projector_bias
+ self.projector_use_layernorm = projector_use_layernorm
  self.downsample_factor = downsample_factor

  if isinstance(vision_config, dict):
@@ -41,7 +41,8 @@ class Lfm2VlMultiModalProjector(nn.Module):
  super().__init__()
  in_channels = config.vision_config.hidden_size * (config.downsample_factor**2)
  self.factor = config.downsample_factor
- self.layer_norm = nn.LayerNorm(in_channels)
+ self.use_layer_norm = config.projector_use_layernorm
+ self.layer_norm = nn.LayerNorm(in_channels) if config.projector_use_layernorm else None
  self.linear_1 = nn.Linear(
  in_channels,
  config.projector_hidden_size,
@@ -56,7 +57,8 @@ class Lfm2VlMultiModalProjector(nn.Module):

  def forward(self, image_features: torch.Tensor):
  image_features = self.pixel_unshuffle(image_features)
- image_features = self.layer_norm(image_features)
+ if self.use_layer_norm:
+ image_features = self.layer_norm(image_features)
  hidden_states = self.linear_1(image_features)
  hidden_states = self.act(hidden_states)
  hidden_states = self.linear_2(hidden_states)
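The config and projector hunks wire one new switch end to end: `projector_use_layernorm` in `Lfm2VlConfig` controls whether the multimodal projector applies a `LayerNorm` after pixel unshuffling. A usage sketch, assuming only the keyword shown in these hunks:

    from transformers import Lfm2VlConfig

    config = Lfm2VlConfig(projector_use_layernorm=False)  # skip the projector LayerNorm
    # The default (True) preserves the previous behavior of always normalizing.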
@@ -448,6 +450,7 @@ class Lfm2VlForConditionalGeneration(Lfm2VlPreTrainedModel, GenerationMixin):
  attention_mask=None,
  cache_position=None,
  logits_to_keep=None,
+ is_first_iteration=False,
  **kwargs,
  ):
  # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -459,12 +462,15 @@ class Lfm2VlForConditionalGeneration(Lfm2VlPreTrainedModel, GenerationMixin):
  attention_mask=attention_mask,
  cache_position=cache_position,
  logits_to_keep=logits_to_keep,
+ is_first_iteration=is_first_iteration,
  **kwargs,
  )

- if cache_position[0] == 0:
- # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
- # Otherwise we need pixel values to be passed to model
+ if is_first_iteration or not kwargs.get("use_cache", True):
+ # Pixel values are used only in the first iteration, if available
+ # In subsequent iterations, they are already merged with text and cached
+ # NOTE: the first iteration doesn't have to be prefill; it can be the first
+ # iteration with a question and a cached system prompt (continuing generation from cache)
  model_inputs["pixel_values"] = pixel_values

  return model_inputs
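The gating change swaps a cache-position test (`cache_position[0] == 0`) for an explicit `is_first_iteration` flag, which also covers the continue-from-cache case where the first generation step is not at position 0. Reduced to a stand-alone function, the new logic is simply:

    def gate_pixel_values(pixel_values, is_first_iteration, use_cache=True):
        # Vision features are merged into the cache on the first step; later
        # steps consume only text tokens, unless caching is disabled entirely.
        return pixel_values if (is_first_iteration or not use_cache) else None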
@@ -41,7 +41,8 @@ class Lfm2VlMultiModalProjector(nn.Module):
  super().__init__()
  in_channels = config.vision_config.hidden_size * (config.downsample_factor**2)
  self.factor = config.downsample_factor
- self.layer_norm = nn.LayerNorm(in_channels)
+ self.use_layer_norm = config.projector_use_layernorm
+ self.layer_norm = nn.LayerNorm(in_channels) if config.projector_use_layernorm else None
  self.linear_1 = nn.Linear(
  in_channels,
  config.projector_hidden_size,
@@ -56,7 +57,8 @@ class Lfm2VlMultiModalProjector(nn.Module):

  def forward(self, image_features: torch.Tensor):
  image_features = self.pixel_unshuffle(image_features)
- image_features = self.layer_norm(image_features)
+ if self.use_layer_norm:
+ image_features = self.layer_norm(image_features)
  hidden_states = self.linear_1(image_features)
  hidden_states = self.act(hidden_states)
  hidden_states = self.linear_2(hidden_states)