transformers 5.0.0rc0-py3-none-any.whl → 5.0.0rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (835)
  1. transformers/__init__.py +49 -3
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/cli/serve.py +47 -17
  6. transformers/configuration_utils.py +114 -70
  7. transformers/conversion_mapping.py +83 -7
  8. transformers/convert_slow_tokenizer.py +225 -10
  9. transformers/core_model_loading.py +374 -147
  10. transformers/data/data_collator.py +12 -4
  11. transformers/dependency_versions_table.py +2 -3
  12. transformers/dynamic_module_utils.py +1 -2
  13. transformers/feature_extraction_utils.py +55 -24
  14. transformers/file_utils.py +0 -1
  15. transformers/generation/__init__.py +11 -1
  16. transformers/generation/candidate_generator.py +79 -31
  17. transformers/generation/configuration_utils.py +165 -124
  18. transformers/generation/continuous_batching/__init__.py +4 -0
  19. transformers/generation/continuous_batching/cache.py +47 -18
  20. transformers/generation/continuous_batching/cache_manager.py +131 -34
  21. transformers/generation/continuous_batching/continuous_api.py +228 -136
  22. transformers/generation/continuous_batching/requests.py +28 -1
  23. transformers/generation/continuous_batching/scheduler.py +11 -4
  24. transformers/generation/stopping_criteria.py +1 -1
  25. transformers/generation/utils.py +108 -110
  26. transformers/generation/watermarking.py +8 -5
  27. transformers/image_processing_base.py +3 -14
  28. transformers/image_processing_utils_fast.py +15 -4
  29. transformers/initialization.py +37 -0
  30. transformers/integrations/__init__.py +16 -2
  31. transformers/integrations/accelerate.py +58 -113
  32. transformers/integrations/aqlm.py +36 -66
  33. transformers/integrations/awq.py +46 -515
  34. transformers/integrations/bitnet.py +47 -105
  35. transformers/integrations/bitsandbytes.py +91 -202
  36. transformers/integrations/deepspeed.py +18 -2
  37. transformers/integrations/eetq.py +84 -81
  38. transformers/integrations/fbgemm_fp8.py +191 -145
  39. transformers/integrations/finegrained_fp8.py +241 -208
  40. transformers/integrations/flash_attention.py +2 -2
  41. transformers/integrations/fp_quant.py +92 -0
  42. transformers/integrations/ggml.py +11 -1
  43. transformers/integrations/higgs.py +37 -62
  44. transformers/integrations/hub_kernels.py +65 -8
  45. transformers/integrations/integration_utils.py +45 -0
  46. transformers/integrations/mistral.py +12 -0
  47. transformers/integrations/moe.py +240 -0
  48. transformers/integrations/mxfp4.py +28 -74
  49. transformers/integrations/peft.py +12 -29
  50. transformers/integrations/quanto.py +77 -56
  51. transformers/integrations/quark.py +55 -0
  52. transformers/integrations/spqr.py +42 -90
  53. transformers/integrations/tensor_parallel.py +167 -221
  54. transformers/integrations/torchao.py +32 -38
  55. transformers/integrations/vptq.py +40 -59
  56. transformers/modelcard.py +1 -2
  57. transformers/modeling_gguf_pytorch_utils.py +74 -19
  58. transformers/modeling_rope_utils.py +107 -86
  59. transformers/modeling_utils.py +611 -527
  60. transformers/models/__init__.py +22 -0
  61. transformers/models/afmoe/modeling_afmoe.py +10 -19
  62. transformers/models/afmoe/modular_afmoe.py +5 -13
  63. transformers/models/aimv2/modeling_aimv2.py +4 -0
  64. transformers/models/aimv2/modular_aimv2.py +4 -0
  65. transformers/models/albert/modeling_albert.py +3 -0
  66. transformers/models/albert/tokenization_albert.py +6 -12
  67. transformers/models/align/modeling_align.py +14 -6
  68. transformers/models/altclip/modeling_altclip.py +11 -3
  69. transformers/models/apertus/modeling_apertus.py +8 -6
  70. transformers/models/apertus/modular_apertus.py +4 -1
  71. transformers/models/arcee/modeling_arcee.py +5 -5
  72. transformers/models/aria/modeling_aria.py +12 -8
  73. transformers/models/aria/modular_aria.py +7 -3
  74. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  75. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  76. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  77. transformers/models/auto/auto_factory.py +1 -1
  78. transformers/models/auto/configuration_auto.py +38 -0
  79. transformers/models/auto/feature_extraction_auto.py +9 -3
  80. transformers/models/auto/image_processing_auto.py +5 -2
  81. transformers/models/auto/modeling_auto.py +37 -0
  82. transformers/models/auto/processing_auto.py +22 -10
  83. transformers/models/auto/tokenization_auto.py +147 -566
  84. transformers/models/auto/video_processing_auto.py +5 -2
  85. transformers/models/autoformer/modeling_autoformer.py +4 -0
  86. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  87. transformers/models/bamba/modeling_bamba.py +21 -21
  88. transformers/models/bamba/modular_bamba.py +17 -16
  89. transformers/models/bark/modeling_bark.py +11 -0
  90. transformers/models/bart/configuration_bart.py +0 -1
  91. transformers/models/bart/modeling_bart.py +14 -0
  92. transformers/models/barthez/tokenization_barthez.py +5 -10
  93. transformers/models/beit/image_processing_beit_fast.py +0 -1
  94. transformers/models/beit/modeling_beit.py +6 -1
  95. transformers/models/bert/modeling_bert.py +3 -0
  96. transformers/models/bert/tokenization_bert.py +8 -21
  97. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  98. transformers/models/big_bird/modeling_big_bird.py +9 -0
  99. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  100. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
  101. transformers/models/biogpt/modeling_biogpt.py +2 -0
  102. transformers/models/biogpt/modular_biogpt.py +2 -0
  103. transformers/models/bit/modeling_bit.py +16 -3
  104. transformers/models/bitnet/modeling_bitnet.py +5 -5
  105. transformers/models/blenderbot/modeling_blenderbot.py +12 -0
  106. transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
  107. transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
  108. transformers/models/blip/modeling_blip.py +2 -0
  109. transformers/models/blip/modeling_blip_text.py +10 -0
  110. transformers/models/blip_2/modeling_blip_2.py +4 -1
  111. transformers/models/bloom/modeling_bloom.py +17 -44
  112. transformers/models/blt/modeling_blt.py +164 -4
  113. transformers/models/blt/modular_blt.py +170 -5
  114. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  115. transformers/models/bridgetower/modeling_bridgetower.py +11 -1
  116. transformers/models/bros/modeling_bros.py +12 -0
  117. transformers/models/camembert/modeling_camembert.py +109 -106
  118. transformers/models/camembert/tokenization_camembert.py +8 -12
  119. transformers/models/canine/modeling_canine.py +11 -0
  120. transformers/models/canine/tokenization_canine.py +2 -0
  121. transformers/models/chameleon/modeling_chameleon.py +11 -5
  122. transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
  123. transformers/models/clap/feature_extraction_clap.py +2 -2
  124. transformers/models/clap/modeling_clap.py +30 -15
  125. transformers/models/clip/modeling_clip.py +2 -0
  126. transformers/models/clip/tokenization_clip.py +22 -44
  127. transformers/models/clipseg/modeling_clipseg.py +9 -0
  128. transformers/models/clvp/modeling_clvp.py +19 -3
  129. transformers/models/clvp/tokenization_clvp.py +1 -63
  130. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  131. transformers/models/codegen/modeling_codegen.py +13 -4
  132. transformers/models/codegen/tokenization_codegen.py +14 -43
  133. transformers/models/cohere/modeling_cohere.py +5 -4
  134. transformers/models/cohere/modular_cohere.py +2 -1
  135. transformers/models/cohere/tokenization_cohere.py +12 -42
  136. transformers/models/cohere2/modeling_cohere2.py +8 -7
  137. transformers/models/cohere2/modular_cohere2.py +5 -5
  138. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
  139. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  140. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  141. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  142. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  143. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  144. transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
  145. transformers/models/convbert/modeling_convbert.py +9 -0
  146. transformers/models/convnext/image_processing_convnext.py +2 -2
  147. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  148. transformers/models/convnext/modeling_convnext.py +2 -4
  149. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  150. transformers/models/csm/generation_csm.py +19 -22
  151. transformers/models/csm/modeling_csm.py +7 -4
  152. transformers/models/csm/modular_csm.py +2 -0
  153. transformers/models/ctrl/modeling_ctrl.py +15 -2
  154. transformers/models/cvt/modeling_cvt.py +7 -1
  155. transformers/models/cwm/modeling_cwm.py +5 -5
  156. transformers/models/d_fine/configuration_d_fine.py +3 -4
  157. transformers/models/d_fine/modeling_d_fine.py +48 -39
  158. transformers/models/d_fine/modular_d_fine.py +16 -4
  159. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  160. transformers/models/dab_detr/modeling_dab_detr.py +5 -1
  161. transformers/models/dac/modeling_dac.py +6 -6
  162. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  163. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  164. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  165. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  166. transformers/models/dbrx/configuration_dbrx.py +9 -1
  167. transformers/models/dbrx/modeling_dbrx.py +3 -3
  168. transformers/models/deberta/modeling_deberta.py +7 -0
  169. transformers/models/deberta/tokenization_deberta.py +11 -20
  170. transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
  171. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  172. transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
  173. transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
  174. transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
  175. transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
  176. transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
  177. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  178. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  179. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  180. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  181. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  182. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  183. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  184. transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
  185. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  186. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  187. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  188. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  189. transformers/models/detr/configuration_detr.py +1 -1
  190. transformers/models/detr/modeling_detr.py +13 -1
  191. transformers/models/dia/generation_dia.py +3 -10
  192. transformers/models/dia/modeling_dia.py +16 -4
  193. transformers/models/dia/modular_dia.py +11 -1
  194. transformers/models/dia/processing_dia.py +1 -1
  195. transformers/models/diffllama/modeling_diffllama.py +5 -5
  196. transformers/models/diffllama/modular_diffllama.py +2 -2
  197. transformers/models/dinat/modeling_dinat.py +3 -0
  198. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  199. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  200. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
  201. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
  202. transformers/models/distilbert/modeling_distilbert.py +11 -9
  203. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  204. transformers/models/doge/modeling_doge.py +3 -4
  205. transformers/models/doge/modular_doge.py +0 -1
  206. transformers/models/donut/image_processing_donut_fast.py +0 -1
  207. transformers/models/donut/modeling_donut_swin.py +18 -12
  208. transformers/models/dots1/modeling_dots1.py +23 -11
  209. transformers/models/dots1/modular_dots1.py +5 -3
  210. transformers/models/dpr/modeling_dpr.py +5 -0
  211. transformers/models/dpr/tokenization_dpr.py +12 -0
  212. transformers/models/dpt/configuration_dpt.py +1 -1
  213. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  214. transformers/models/dpt/modular_dpt.py +1 -2
  215. transformers/models/edgetam/configuration_edgetam.py +1 -1
  216. transformers/models/edgetam/modeling_edgetam.py +6 -3
  217. transformers/models/edgetam/modular_edgetam.py +15 -14
  218. transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
  219. transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
  220. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  221. transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
  222. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  223. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  224. transformers/models/efficientnet/modeling_efficientnet.py +7 -1
  225. transformers/models/electra/modeling_electra.py +7 -0
  226. transformers/models/emu3/modeling_emu3.py +12 -6
  227. transformers/models/emu3/modular_emu3.py +7 -1
  228. transformers/models/encodec/modeling_encodec.py +14 -0
  229. transformers/models/eomt/image_processing_eomt.py +13 -1
  230. transformers/models/eomt/image_processing_eomt_fast.py +60 -16
  231. transformers/models/eomt/modeling_eomt.py +7 -0
  232. transformers/models/eomt/modular_eomt.py +7 -0
  233. transformers/models/ernie/modeling_ernie.py +6 -0
  234. transformers/models/ernie/modular_ernie.py +6 -0
  235. transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
  236. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  237. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
  238. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
  239. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  240. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  241. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  242. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  243. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  244. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  245. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  246. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  247. transformers/models/esm/modeling_esm.py +6 -0
  248. transformers/models/esm/modeling_esmfold.py +11 -5
  249. transformers/models/evolla/modeling_evolla.py +13 -5
  250. transformers/models/evolla/modular_evolla.py +8 -0
  251. transformers/models/exaone4/modeling_exaone4.py +3 -3
  252. transformers/models/exaone4/modular_exaone4.py +0 -1
  253. transformers/models/falcon/modeling_falcon.py +9 -4
  254. transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
  255. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  256. transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
  257. transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
  258. transformers/models/fast_vlm/__init__.py +27 -0
  259. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  260. transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
  261. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  262. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
  263. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  264. transformers/models/flaubert/modeling_flaubert.py +21 -15
  265. transformers/models/flava/image_processing_flava_fast.py +0 -2
  266. transformers/models/flava/modeling_flava.py +10 -2
  267. transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
  268. transformers/models/florence2/modeling_florence2.py +22 -4
  269. transformers/models/florence2/modular_florence2.py +15 -1
  270. transformers/models/fnet/modeling_fnet.py +14 -0
  271. transformers/models/focalnet/modeling_focalnet.py +4 -0
  272. transformers/models/fsmt/modeling_fsmt.py +2 -0
  273. transformers/models/funnel/modeling_funnel.py +8 -0
  274. transformers/models/funnel/tokenization_funnel.py +17 -24
  275. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  276. transformers/models/fuyu/modeling_fuyu.py +3 -1
  277. transformers/models/fuyu/processing_fuyu.py +19 -3
  278. transformers/models/gemma/modeling_gemma.py +14 -16
  279. transformers/models/gemma/modular_gemma.py +9 -11
  280. transformers/models/gemma/tokenization_gemma.py +10 -27
  281. transformers/models/gemma2/modeling_gemma2.py +5 -5
  282. transformers/models/gemma2/modular_gemma2.py +3 -2
  283. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  284. transformers/models/gemma3/modeling_gemma3.py +42 -91
  285. transformers/models/gemma3/modular_gemma3.py +38 -87
  286. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  287. transformers/models/gemma3n/modeling_gemma3n.py +65 -218
  288. transformers/models/gemma3n/modular_gemma3n.py +68 -68
  289. transformers/models/git/modeling_git.py +183 -126
  290. transformers/models/glm/modeling_glm.py +5 -5
  291. transformers/models/glm4/modeling_glm4.py +5 -5
  292. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  293. transformers/models/glm46v/modeling_glm46v.py +3 -1
  294. transformers/models/glm46v/modular_glm46v.py +3 -0
  295. transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
  296. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  297. transformers/models/glm4v/configuration_glm4v.py +3 -1
  298. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  299. transformers/models/glm4v/modeling_glm4v.py +18 -8
  300. transformers/models/glm4v/modular_glm4v.py +17 -7
  301. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  302. transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
  303. transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
  304. transformers/models/glmasr/__init__.py +30 -0
  305. transformers/models/glmasr/configuration_glmasr.py +197 -0
  306. transformers/models/glmasr/modeling_glmasr.py +512 -0
  307. transformers/models/glmasr/modular_glmasr.py +433 -0
  308. transformers/models/glmasr/processing_glmasr.py +332 -0
  309. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  310. transformers/models/glpn/modeling_glpn.py +2 -0
  311. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  312. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  313. transformers/models/gpt2/modeling_gpt2.py +13 -6
  314. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  315. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
  316. transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
  317. transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
  318. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  319. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  320. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
  321. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  322. transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
  323. transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
  324. transformers/models/gptj/modeling_gptj.py +18 -6
  325. transformers/models/granite/modeling_granite.py +5 -5
  326. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  327. transformers/models/granitemoe/modeling_granitemoe.py +6 -9
  328. transformers/models/granitemoe/modular_granitemoe.py +1 -4
  329. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  330. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
  331. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  332. transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
  333. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  334. transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
  335. transformers/models/groupvit/modeling_groupvit.py +9 -1
  336. transformers/models/helium/modeling_helium.py +5 -4
  337. transformers/models/herbert/tokenization_herbert.py +9 -25
  338. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
  339. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
  340. transformers/models/hiera/modeling_hiera.py +4 -0
  341. transformers/models/hubert/modeling_hubert.py +7 -0
  342. transformers/models/hubert/modular_hubert.py +5 -0
  343. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
  344. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  345. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  346. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
  347. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  348. transformers/models/ibert/modeling_ibert.py +22 -0
  349. transformers/models/idefics/modeling_idefics.py +15 -21
  350. transformers/models/idefics2/modeling_idefics2.py +7 -1
  351. transformers/models/idefics3/modeling_idefics3.py +5 -1
  352. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  353. transformers/models/imagegpt/modeling_imagegpt.py +11 -3
  354. transformers/models/informer/modeling_informer.py +4 -0
  355. transformers/models/informer/modular_informer.py +1 -0
  356. transformers/models/instructblip/modeling_instructblip.py +2 -0
  357. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  358. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  359. transformers/models/internvl/modeling_internvl.py +13 -12
  360. transformers/models/internvl/modular_internvl.py +7 -13
  361. transformers/models/internvl/video_processing_internvl.py +0 -1
  362. transformers/models/jais2/__init__.py +27 -0
  363. transformers/models/jais2/configuration_jais2.py +152 -0
  364. transformers/models/jais2/modeling_jais2.py +486 -0
  365. transformers/models/jais2/modular_jais2.py +196 -0
  366. transformers/models/jamba/modeling_jamba.py +25 -20
  367. transformers/models/jamba/modular_jamba.py +17 -17
  368. transformers/models/janus/image_processing_janus_fast.py +0 -1
  369. transformers/models/janus/modeling_janus.py +16 -7
  370. transformers/models/janus/modular_janus.py +17 -7
  371. transformers/models/jetmoe/modeling_jetmoe.py +4 -4
  372. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  373. transformers/models/kosmos2/modeling_kosmos2.py +15 -2
  374. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  375. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  376. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
  377. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  378. transformers/models/lasr/__init__.py +29 -0
  379. transformers/models/lasr/configuration_lasr.py +248 -0
  380. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  381. transformers/models/lasr/modeling_lasr.py +730 -0
  382. transformers/models/lasr/modular_lasr.py +576 -0
  383. transformers/models/lasr/processing_lasr.py +94 -0
  384. transformers/models/lasr/tokenization_lasr.py +186 -0
  385. transformers/models/layoutlm/modeling_layoutlm.py +10 -3
  386. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  387. transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
  388. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
  389. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  390. transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
  391. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  392. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  393. transformers/models/led/modeling_led.py +12 -0
  394. transformers/models/levit/modeling_levit.py +21 -0
  395. transformers/models/lfm2/modeling_lfm2.py +5 -6
  396. transformers/models/lfm2/modular_lfm2.py +0 -1
  397. transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
  398. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  399. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  400. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  401. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  402. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  403. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  404. transformers/models/lightglue/modeling_lightglue.py +3 -1
  405. transformers/models/lightglue/modular_lightglue.py +1 -0
  406. transformers/models/lilt/modeling_lilt.py +23 -15
  407. transformers/models/llama/modeling_llama.py +5 -5
  408. transformers/models/llama/tokenization_llama.py +15 -43
  409. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  410. transformers/models/llama4/modeling_llama4.py +11 -6
  411. transformers/models/llava/image_processing_llava_fast.py +0 -1
  412. transformers/models/llava/modeling_llava.py +12 -7
  413. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  414. transformers/models/llava_next/modeling_llava_next.py +7 -3
  415. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  416. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  417. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  418. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  419. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  420. transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
  421. transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
  422. transformers/models/longformer/modeling_longformer.py +6 -0
  423. transformers/models/longt5/modeling_longt5.py +4 -4
  424. transformers/models/luke/modeling_luke.py +9 -0
  425. transformers/models/luke/tokenization_luke.py +11 -38
  426. transformers/models/lxmert/modeling_lxmert.py +2 -0
  427. transformers/models/m2m_100/modeling_m2m_100.py +14 -0
  428. transformers/models/mamba/modeling_mamba.py +16 -23
  429. transformers/models/mamba2/modeling_mamba2.py +24 -23
  430. transformers/models/marian/configuration_marian.py +1 -1
  431. transformers/models/marian/modeling_marian.py +8 -0
  432. transformers/models/markuplm/modeling_markuplm.py +9 -8
  433. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  434. transformers/models/mask2former/configuration_mask2former.py +3 -3
  435. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  436. transformers/models/mask2former/modeling_mask2former.py +11 -0
  437. transformers/models/maskformer/configuration_maskformer.py +3 -3
  438. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  439. transformers/models/maskformer/modeling_maskformer.py +11 -1
  440. transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
  441. transformers/models/mbart/configuration_mbart.py +1 -0
  442. transformers/models/mbart/modeling_mbart.py +14 -0
  443. transformers/models/mbart/tokenization_mbart.py +11 -52
  444. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  445. transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
  446. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  447. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  448. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  449. transformers/models/mimi/modeling_mimi.py +28 -5
  450. transformers/models/minimax/modeling_minimax.py +19 -6
  451. transformers/models/minimax/modular_minimax.py +12 -1
  452. transformers/models/ministral/modeling_ministral.py +5 -5
  453. transformers/models/ministral3/configuration_ministral3.py +1 -1
  454. transformers/models/ministral3/modeling_ministral3.py +5 -4
  455. transformers/models/mistral/modeling_mistral.py +5 -4
  456. transformers/models/mistral3/modeling_mistral3.py +10 -4
  457. transformers/models/mistral3/modular_mistral3.py +3 -1
  458. transformers/models/mixtral/modeling_mixtral.py +15 -7
  459. transformers/models/mixtral/modular_mixtral.py +6 -2
  460. transformers/models/mlcd/modeling_mlcd.py +6 -0
  461. transformers/models/mlcd/modular_mlcd.py +4 -0
  462. transformers/models/mllama/modeling_mllama.py +15 -4
  463. transformers/models/mluke/tokenization_mluke.py +6 -6
  464. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  465. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
  466. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  467. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  468. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  469. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  470. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  471. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  472. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  473. transformers/models/mobilevit/modeling_mobilevit.py +7 -0
  474. transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
  475. transformers/models/modernbert/modeling_modernbert.py +16 -2
  476. transformers/models/modernbert/modular_modernbert.py +14 -1
  477. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
  478. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
  479. transformers/models/moonshine/modeling_moonshine.py +5 -3
  480. transformers/models/moshi/modeling_moshi.py +26 -53
  481. transformers/models/mpnet/modeling_mpnet.py +7 -0
  482. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  483. transformers/models/mpt/modeling_mpt.py +2 -0
  484. transformers/models/mra/modeling_mra.py +10 -1
  485. transformers/models/mt5/configuration_mt5.py +2 -3
  486. transformers/models/mt5/modeling_mt5.py +7 -10
  487. transformers/models/musicgen/modeling_musicgen.py +7 -9
  488. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
  489. transformers/models/mvp/modeling_mvp.py +14 -0
  490. transformers/models/nanochat/modeling_nanochat.py +5 -5
  491. transformers/models/nemotron/modeling_nemotron.py +7 -5
  492. transformers/models/nllb/tokenization_nllb.py +8 -22
  493. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  494. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  495. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  496. transformers/models/nougat/tokenization_nougat.py +15 -68
  497. transformers/models/nystromformer/modeling_nystromformer.py +13 -0
  498. transformers/models/olmo/modeling_olmo.py +5 -5
  499. transformers/models/olmo/modular_olmo.py +2 -2
  500. transformers/models/olmo2/modeling_olmo2.py +5 -6
  501. transformers/models/olmo2/modular_olmo2.py +0 -1
  502. transformers/models/olmo3/modeling_olmo3.py +5 -5
  503. transformers/models/olmoe/modeling_olmoe.py +15 -7
  504. transformers/models/olmoe/modular_olmoe.py +4 -2
  505. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  506. transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
  507. transformers/models/oneformer/configuration_oneformer.py +3 -3
  508. transformers/models/oneformer/modeling_oneformer.py +11 -39
  509. transformers/models/openai/modeling_openai.py +15 -0
  510. transformers/models/openai/tokenization_openai.py +10 -46
  511. transformers/models/opt/modeling_opt.py +2 -0
  512. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  513. transformers/models/ovis2/modeling_ovis2.py +15 -3
  514. transformers/models/ovis2/modular_ovis2.py +8 -0
  515. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  516. transformers/models/owlv2/modeling_owlv2.py +11 -3
  517. transformers/models/owlv2/modular_owlv2.py +0 -2
  518. transformers/models/owlvit/modeling_owlvit.py +11 -3
  519. transformers/models/paddleocr_vl/__init__.py +32 -0
  520. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  521. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
  522. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  523. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
  524. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
  525. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  526. transformers/models/paligemma/modeling_paligemma.py +25 -17
  527. transformers/models/parakeet/configuration_parakeet.py +4 -6
  528. transformers/models/parakeet/modeling_parakeet.py +14 -6
  529. transformers/models/parakeet/modular_parakeet.py +7 -2
  530. transformers/models/parakeet/processing_parakeet.py +1 -0
  531. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  532. transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
  533. transformers/models/patchtst/modeling_patchtst.py +25 -6
  534. transformers/models/pe_audio/__init__.py +30 -0
  535. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  536. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  537. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  538. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  539. transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
  540. transformers/models/pe_audio_video/__init__.py +29 -0
  541. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  542. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  543. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  544. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  545. transformers/models/pe_video/__init__.py +30 -0
  546. transformers/models/pe_video/configuration_pe_video.py +211 -0
  547. transformers/models/pe_video/modeling_pe_video.py +636 -0
  548. transformers/models/pe_video/modular_pe_video.py +219 -0
  549. transformers/models/pe_video/processing_pe_video.py +10 -0
  550. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  551. transformers/models/pegasus/configuration_pegasus.py +1 -0
  552. transformers/models/pegasus/modeling_pegasus.py +8 -0
  553. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  554. transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
  555. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  556. transformers/models/perceiver/modeling_perceiver.py +13 -1
  557. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  558. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  559. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  560. transformers/models/persimmon/modeling_persimmon.py +3 -2
  561. transformers/models/phi/modeling_phi.py +5 -6
  562. transformers/models/phi/modular_phi.py +0 -1
  563. transformers/models/phi3/modeling_phi3.py +3 -2
  564. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
  565. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
  566. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  567. transformers/models/phimoe/modeling_phimoe.py +15 -7
  568. transformers/models/phimoe/modular_phimoe.py +3 -3
  569. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  570. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  571. transformers/models/pixio/__init__.py +30 -0
  572. transformers/models/pixio/configuration_pixio.py +151 -0
  573. transformers/models/pixio/modeling_pixio.py +507 -0
  574. transformers/models/pixio/modular_pixio.py +404 -0
  575. transformers/models/pixtral/modeling_pixtral.py +3 -2
  576. transformers/models/pixtral/processing_pixtral.py +3 -1
  577. transformers/models/plbart/configuration_plbart.py +1 -0
  578. transformers/models/plbart/modeling_plbart.py +13 -0
  579. transformers/models/plbart/modular_plbart.py +8 -0
  580. transformers/models/plbart/tokenization_plbart.py +0 -2
  581. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  582. transformers/models/poolformer/modeling_poolformer.py +13 -1
  583. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  584. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  585. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  586. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  587. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  588. transformers/models/prophetnet/modeling_prophetnet.py +5 -1
  589. transformers/models/pvt/modeling_pvt.py +2 -0
  590. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  591. transformers/models/qwen2/modeling_qwen2.py +5 -5
  592. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  593. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  594. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
  595. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
  596. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  597. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
  598. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
  599. transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
  600. transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
  601. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  602. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  603. transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
  604. transformers/models/qwen3/modeling_qwen3.py +5 -5
  605. transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
  606. transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
  607. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  608. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
  609. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
  610. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  611. transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
  612. transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
  613. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  614. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
  615. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
  616. transformers/models/rag/configuration_rag.py +0 -8
  617. transformers/models/rag/modeling_rag.py +8 -9
  618. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
  619. transformers/models/reformer/modeling_reformer.py +13 -1
  620. transformers/models/reformer/tokenization_reformer.py +11 -28
  621. transformers/models/regnet/modeling_regnet.py +10 -1
  622. transformers/models/rembert/modeling_rembert.py +13 -1
  623. transformers/models/rembert/tokenization_rembert.py +3 -10
  624. transformers/models/resnet/modeling_resnet.py +19 -5
  625. transformers/models/roberta/modeling_roberta.py +3 -0
  626. transformers/models/roberta/modular_roberta.py +3 -0
  627. transformers/models/roberta/tokenization_roberta.py +18 -27
  628. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  629. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  630. transformers/models/roformer/modeling_roformer.py +6 -0
  631. transformers/models/roformer/tokenization_roformer.py +77 -412
  632. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  633. transformers/models/rt_detr/modeling_rt_detr.py +6 -0
  634. transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
  635. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  636. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
  637. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  638. transformers/models/rwkv/modeling_rwkv.py +2 -1
  639. transformers/models/sam/configuration_sam.py +1 -0
  640. transformers/models/sam/image_processing_sam_fast.py +0 -1
  641. transformers/models/sam/modeling_sam.py +4 -1
  642. transformers/models/sam2/configuration_sam2.py +1 -1
  643. transformers/models/sam2/modeling_sam2.py +7 -3
  644. transformers/models/sam2/modular_sam2.py +7 -3
  645. transformers/models/sam2_video/modeling_sam2_video.py +52 -43
  646. transformers/models/sam2_video/modular_sam2_video.py +32 -18
  647. transformers/models/sam3/configuration_sam3.py +21 -1
  648. transformers/models/sam3/modeling_sam3.py +100 -80
  649. transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
  650. transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
  651. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  652. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
  653. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  654. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  655. transformers/models/sam3_video/modeling_sam3_video.py +4 -3
  656. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  657. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  658. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  659. transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
  660. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  661. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
  662. transformers/models/seed_oss/modeling_seed_oss.py +3 -3
  663. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  664. transformers/models/segformer/modeling_segformer.py +6 -3
  665. transformers/models/segformer/modular_segformer.py +0 -1
  666. transformers/models/seggpt/modeling_seggpt.py +2 -0
  667. transformers/models/sew/modeling_sew.py +3 -0
  668. transformers/models/sew/modular_sew.py +1 -0
  669. transformers/models/sew_d/modeling_sew_d.py +3 -0
  670. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  671. transformers/models/siglip/modeling_siglip.py +24 -2
  672. transformers/models/siglip2/modeling_siglip2.py +67 -41
  673. transformers/models/siglip2/modular_siglip2.py +4 -0
  674. transformers/models/smollm3/modeling_smollm3.py +5 -5
  675. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  676. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  677. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  678. transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
  679. transformers/models/speecht5/modeling_speecht5.py +41 -1
  680. transformers/models/splinter/modeling_splinter.py +12 -3
  681. transformers/models/splinter/tokenization_splinter.py +9 -28
  682. transformers/models/squeezebert/modeling_squeezebert.py +8 -0
  683. transformers/models/stablelm/modeling_stablelm.py +4 -2
  684. transformers/models/starcoder2/modeling_starcoder2.py +5 -4
  685. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  686. transformers/models/superglue/modeling_superglue.py +1 -0
  687. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  688. transformers/models/superpoint/modeling_superpoint.py +1 -0
  689. transformers/models/swiftformer/modeling_swiftformer.py +6 -0
  690. transformers/models/swin/modeling_swin.py +20 -12
  691. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  692. transformers/models/swin2sr/modeling_swin2sr.py +51 -33
  693. transformers/models/swinv2/modeling_swinv2.py +45 -33
  694. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  695. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  696. transformers/models/t5/configuration_t5.py +7 -1
  697. transformers/models/t5/modeling_t5.py +8 -7
  698. transformers/models/t5/tokenization_t5.py +4 -8
  699. transformers/models/t5gemma/modeling_t5gemma.py +6 -6
  700. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  701. transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
  702. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  703. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  704. transformers/models/table_transformer/modeling_table_transformer.py +5 -1
  705. transformers/models/tapas/modeling_tapas.py +3 -0
  706. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  707. transformers/models/textnet/modeling_textnet.py +11 -2
  708. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  709. transformers/models/timesfm/modeling_timesfm.py +14 -0
  710. transformers/models/timesfm/modular_timesfm.py +14 -0
  711. transformers/models/timesformer/modeling_timesformer.py +2 -0
  712. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  713. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  714. transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
  715. transformers/models/trocr/modeling_trocr.py +3 -2
  716. transformers/models/tvp/configuration_tvp.py +5 -1
  717. transformers/models/tvp/modeling_tvp.py +6 -4
  718. transformers/models/udop/configuration_udop.py +1 -0
  719. transformers/models/udop/modeling_udop.py +7 -7
  720. transformers/models/udop/tokenization_udop.py +5 -13
  721. transformers/models/umt5/configuration_umt5.py +2 -2
  722. transformers/models/umt5/modeling_umt5.py +7 -6
  723. transformers/models/unispeech/modeling_unispeech.py +4 -0
  724. transformers/models/unispeech/modular_unispeech.py +2 -0
  725. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  726. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  727. transformers/models/univnet/modeling_univnet.py +1 -0
  728. transformers/models/upernet/modeling_upernet.py +1 -0
  729. transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
  730. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  731. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  732. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  733. transformers/models/video_llava/modeling_video_llava.py +7 -3
  734. transformers/models/vilt/configuration_vilt.py +2 -2
  735. transformers/models/vilt/modeling_vilt.py +13 -0
  736. transformers/models/vipllava/modeling_vipllava.py +7 -3
  737. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  738. transformers/models/visual_bert/modeling_visual_bert.py +8 -0
  739. transformers/models/vitdet/modeling_vitdet.py +2 -0
  740. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  741. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  742. transformers/models/vitmatte/modeling_vitmatte.py +5 -0
  743. transformers/models/vitpose/configuration_vitpose.py +1 -1
  744. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  745. transformers/models/vits/modeling_vits.py +1 -0
  746. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  747. transformers/models/voxtral/modeling_voxtral.py +2 -2
  748. transformers/models/voxtral/modular_voxtral.py +2 -2
  749. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  750. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
  751. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
  752. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
  753. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  754. transformers/models/wavlm/modeling_wavlm.py +5 -0
  755. transformers/models/whisper/generation_whisper.py +1 -0
  756. transformers/models/whisper/modeling_whisper.py +11 -3
  757. transformers/models/whisper/tokenization_whisper.py +4 -15
  758. transformers/models/x_clip/modeling_x_clip.py +5 -0
  759. transformers/models/xcodec/modeling_xcodec.py +5 -0
  760. transformers/models/xglm/modeling_xglm.py +11 -0
  761. transformers/models/xglm/tokenization_xglm.py +4 -9
  762. transformers/models/xlm/modeling_xlm.py +18 -14
  763. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  764. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  765. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  766. transformers/models/xlnet/modeling_xlnet.py +3 -1
  767. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  768. transformers/models/xmod/modeling_xmod.py +3 -0
  769. transformers/models/yoso/modeling_yoso.py +10 -1
  770. transformers/models/zamba/modeling_zamba.py +4 -1
  771. transformers/models/zamba2/modeling_zamba2.py +7 -4
  772. transformers/models/zamba2/modular_zamba2.py +1 -1
  773. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  774. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  775. transformers/models/zoedepth/modeling_zoedepth.py +8 -0
  776. transformers/pipelines/__init__.py +11 -9
  777. transformers/pipelines/automatic_speech_recognition.py +20 -12
  778. transformers/pipelines/base.py +2 -10
  779. transformers/pipelines/document_question_answering.py +4 -2
  780. transformers/pipelines/question_answering.py +1 -1
  781. transformers/pipelines/text_generation.py +1 -1
  782. transformers/pipelines/text_to_audio.py +2 -2
  783. transformers/processing_utils.py +133 -50
  784. transformers/quantizers/auto.py +2 -4
  785. transformers/quantizers/base.py +44 -174
  786. transformers/quantizers/quantizer_aqlm.py +2 -23
  787. transformers/quantizers/quantizer_auto_round.py +2 -12
  788. transformers/quantizers/quantizer_awq.py +20 -89
  789. transformers/quantizers/quantizer_bitnet.py +4 -14
  790. transformers/quantizers/quantizer_bnb_4bit.py +18 -155
  791. transformers/quantizers/quantizer_bnb_8bit.py +24 -110
  792. transformers/quantizers/quantizer_compressed_tensors.py +2 -9
  793. transformers/quantizers/quantizer_eetq.py +16 -74
  794. transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
  795. transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
  796. transformers/quantizers/quantizer_fp_quant.py +52 -82
  797. transformers/quantizers/quantizer_gptq.py +8 -28
  798. transformers/quantizers/quantizer_higgs.py +42 -60
  799. transformers/quantizers/quantizer_hqq.py +144 -153
  800. transformers/quantizers/quantizer_mxfp4.py +14 -194
  801. transformers/quantizers/quantizer_quanto.py +35 -79
  802. transformers/quantizers/quantizer_quark.py +36 -17
  803. transformers/quantizers/quantizer_spqr.py +4 -12
  804. transformers/quantizers/quantizer_torchao.py +50 -325
  805. transformers/quantizers/quantizer_vptq.py +4 -27
  806. transformers/quantizers/quantizers_utils.py +20 -0
  807. transformers/testing_utils.py +324 -47
  808. transformers/tokenization_mistral_common.py +7 -2
  809. transformers/tokenization_utils_base.py +116 -224
  810. transformers/tokenization_utils_tokenizers.py +190 -106
  811. transformers/trainer.py +51 -32
  812. transformers/trainer_callback.py +8 -0
  813. transformers/trainer_jit_checkpoint.py +126 -0
  814. transformers/trainer_seq2seq.py +4 -0
  815. transformers/trainer_utils.py +1 -1
  816. transformers/training_args.py +74 -38
  817. transformers/utils/__init__.py +7 -4
  818. transformers/utils/attention_visualizer.py +4 -4
  819. transformers/utils/auto_docstring.py +35 -25
  820. transformers/utils/generic.py +47 -1
  821. transformers/utils/hub.py +5 -15
  822. transformers/utils/import_utils.py +112 -25
  823. transformers/utils/kernel_config.py +74 -19
  824. transformers/utils/loading_report.py +19 -10
  825. transformers/utils/quantization_config.py +78 -245
  826. transformers/video_processing_utils.py +17 -14
  827. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
  828. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
  829. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
  830. transformers/kernels/__init__.py +0 -0
  831. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  832. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  833. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  834. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
  835. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -165,63 +165,103 @@ class Lfm2VlProcessor(ProcessorMixin):
         image_sizes: list[list[int]],
         use_image_special_tokens: bool,
         **images_kwargs,
-    ):
-        prompt_strings = []
+    ) -> list[str]:
+        use_thumbnail = images_kwargs.get("use_thumbnail", self.image_processor.use_thumbnail)
+        image_data = iter(zip(image_rows, image_cols, image_sizes))
 
-        image_data = iter(zip(*[image_rows, image_cols, image_sizes]))
+        prompt_strings = []
         for sample_text, sample_images in zip(text, images):
-            split_sample = sample_text.split(self.image_token)
-            sample_text_with_image_tokens = ""
-            for i, image in enumerate(sample_images):
-                sample_text_with_image_tokens += split_sample[i]
-                if use_image_special_tokens:
-                    sample_text_with_image_tokens += self.image_start_token
+            text_parts = sample_text.split(self.image_token)
+            result_parts = []
+
+            for i, _ in enumerate(sample_images):
+                result_parts.append(text_parts[i])
 
                 rows, cols, image_size = next(image_data)
-                num_thumbnail_tokens, num_tokens_per_tile = self._get_image_num_tokens(image_size, **images_kwargs)
-
-                if rows > 1 or cols > 1:
-                    for row in range(rows):
-                        for col in range(cols):
-                            if use_image_special_tokens:
-                                sample_text_with_image_tokens += f"<|img_row_{row + 1}_col_{col + 1}|>"
-                            sample_text_with_image_tokens += self.image_token * num_tokens_per_tile
-
-                    if num_thumbnail_tokens > 0:
-                        if use_image_special_tokens:
-                            sample_text_with_image_tokens += self.image_thumbnail_token
-                        sample_text_with_image_tokens += self.image_token * num_thumbnail_tokens
-                else:
-                    sample_text_with_image_tokens += self.image_token * num_thumbnail_tokens
+                tokens_per_tile, tokens_for_image = self._get_image_num_tokens(image_size, **images_kwargs)
+                image_tokens = self._build_image_tokens(
+                    rows,
+                    cols,
+                    tokens_per_tile,
+                    tokens_for_image,
+                    use_thumbnail,
+                    use_image_special_tokens,
+                )
+                result_parts.append(image_tokens)
 
-                if use_image_special_tokens:
-                    sample_text_with_image_tokens += self.image_end_token
+            # Add remaining text after the last image
+            if len(sample_images) < len(text_parts):
+                result_parts.append(text_parts[-1])
 
-                sample_text_with_image_tokens += split_sample[i + 1]
-            prompt_strings.append(sample_text_with_image_tokens)
+            prompt_strings.append("".join(result_parts))
 
         return prompt_strings
 
+    def _build_image_tokens(
+        self,
+        rows: int,
+        cols: int,
+        tokens_per_tile: int,
+        tokens_for_image: int,
+        use_thumbnail: bool,
+        use_image_special_tokens: bool,
+    ) -> str:
+        """Build the expanded token string for a single image."""
+        parts = []
+
+        if use_image_special_tokens:
+            parts.append(self.image_start_token)
+
+        is_multi_tile = rows > 1 or cols > 1
+        if is_multi_tile:
+            for row in range(rows):
+                for col in range(cols):
+                    if use_image_special_tokens:
+                        parts.append(f"<|img_row_{row + 1}_col_{col + 1}|>")
+                    parts.append(self.image_token * tokens_per_tile)
+
+            if use_thumbnail:
+                if use_image_special_tokens:
+                    parts.append(self.image_thumbnail_token)
+                parts.append(self.image_token * tokens_for_image)
+        else:
+            parts.append(self.image_token * tokens_for_image)
+
+        if use_image_special_tokens:
+            parts.append(self.image_end_token)
+
+        return "".join(parts)
+
+    def _compute_tokens_per_tile(self, tile_size: int, encoder_patch_size: int, downsample_factor: int) -> int:
+        """Compute the number of tokens for a single tile."""
+        num_patches = tile_size // encoder_patch_size
+        downsampled_patches = math.ceil(num_patches / downsample_factor)
+        return downsampled_patches * downsampled_patches
+
+    def _compute_tokens_for_image(self, image_size: list[int], encoder_patch_size: int, downsample_factor: int) -> int:
+        """Compute the number of tokens for a resized image (used for single-tile or thumbnail)."""
+        image_height, image_width = image_size
+        patches_h = math.ceil((image_height // encoder_patch_size) / downsample_factor)
+        patches_w = math.ceil((image_width // encoder_patch_size) / downsample_factor)
+        return patches_h * patches_w
+
     def _get_image_num_tokens(self, image_size: list[int], **images_kwargs) -> tuple[int, int]:
+        """
+        Compute token counts for image processing.
+
+        Returns:
+            tuple[int, int]: (tokens_per_tile, tokens_for_image)
+                - tokens_per_tile: tokens for each tile in multi-tile mode
+                - tokens_for_image: tokens for the resized image (single-tile) or thumbnail (multi-tile)
+        """
         tile_size = images_kwargs.get("tile_size", self.image_processor.tile_size)
         downsample_factor = images_kwargs.get("downsample_factor", self.image_processor.downsample_factor)
         encoder_patch_size = images_kwargs.get("encoder_patch_size", self.image_processor.encoder_patch_size)
-        use_thumbnail = images_kwargs.get("use_thumbnail", self.image_processor.use_thumbnail)
-
-        thumbnail_tokens = 0
-        if use_thumbnail:
-            image_height, image_width = image_size
-            num_patches_height = image_height // encoder_patch_size
-            num_patches_width = image_width // encoder_patch_size
-            dwn_num_patches_height = math.ceil(num_patches_height / downsample_factor)
-            dwn_num_patches_width = math.ceil(num_patches_width / downsample_factor)
-            thumbnail_tokens = dwn_num_patches_height * dwn_num_patches_width
 
-        num_patches_tile = tile_size // encoder_patch_size
-        dwn_num_patches_tile = math.ceil(num_patches_tile / downsample_factor)
-        tile_tokens = dwn_num_patches_tile * dwn_num_patches_tile
+        tokens_per_tile = self._compute_tokens_per_tile(tile_size, encoder_patch_size, downsample_factor)
+        tokens_for_image = self._compute_tokens_for_image(image_size, encoder_patch_size, downsample_factor)
 
-        return thumbnail_tokens, tile_tokens
+        return tokens_per_tile, tokens_for_image
 
     def batch_decode(self, *args, **kwargs):
         """
@@ -174,9 +174,8 @@ class LightGlueImageProcessorFast(BaseImageProcessorFast):
         stacked_pairs = [torch.stack(pair, dim=0) for pair in image_pairs]

         # Return in same format as slow processor
-        image_pairs = torch.stack(stacked_pairs, dim=0) if return_tensors else stacked_pairs

-        return BatchFeature(data={"pixel_values": image_pairs})
+        return BatchFeature(data={"pixel_values": stacked_pairs}, tensor_type=return_tensors)

     def post_process_keypoint_matching(
         self,
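
The fast-processor hunks here and below (Llava, LlavaNext, Llama4) follow one pattern: the manual `torch.stack`/`torch.cat` guarded by `return_tensors` is dropped, and the raw list is handed to `BatchFeature` with `tensor_type=return_tensors`, which is relied on to do (or skip) the conversion. In effect, the conditional moves inside `BatchFeature`; a sketch of what the processor used to do inline:

    import torch

    stacked_pairs = [torch.rand(2, 3, 32, 32) for _ in range(4)]  # toy "image pairs"
    return_tensors = "pt"

    # Previously inlined in the processor, now delegated to BatchFeature:
    pixel_values = torch.stack(stacked_pairs, dim=0) if return_tensors else stacked_pairs
    print(pixel_values.shape)  # torch.Size([4, 2, 3, 32, 32])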
@@ -27,6 +27,7 @@ from torch import nn
 from torch.nn.utils.rnn import pad_sequence

 from ...activations import ACT2FN
+from ...integrations import use_kernelized_func
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
@@ -174,6 +175,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class LightGlueAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -199,7 +201,6 @@ class LightGlueAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb

     def forward(
         self,
@@ -870,6 +871,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, "LightGlueKeypointMatchingOutput"]:
         loss = None
         if labels is not None:
@@ -927,6 +927,7 @@ class LightGlueForKeypointMatching(LightGluePreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, "LightGlueKeypointMatchingOutput"]:
         loss = None
         if labels is not None:
@@ -21,6 +21,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import (
@@ -279,11 +280,9 @@ class LiltSelfAttention(nn.Module):
         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
         context_layer = context_layer.view(*new_context_layer_shape)

-        outputs = (
-            ((context_layer, layout_context_layer), attention_probs)
-            if output_attentions
-            else ((context_layer, layout_context_layer),)
-        )
+        outputs = (context_layer, layout_context_layer)
+        if output_attentions:
+            outputs = outputs + (attention_probs,)

         return outputs

@@ -327,9 +326,9 @@ class LiltAttention(nn.Module):
             attention_mask,
             output_attentions,
         )
-        attention_output = self.output(self_outputs[0][0], hidden_states)
-        layout_attention_output = self.layout_output(self_outputs[0][1], layout_inputs)
-        outputs = ((attention_output, layout_attention_output),) + self_outputs[1:]  # add attentions if we output them
+        attention_output = self.output(self_outputs[0], hidden_states)
+        layout_attention_output = self.layout_output(self_outputs[1], layout_inputs)
+        outputs = (attention_output, layout_attention_output) + self_outputs[2:]  # add attentions if we output them
         return outputs


@@ -395,10 +394,10 @@ class LiltLayer(GradientCheckpointingLayer):
             attention_mask,
             output_attentions=output_attentions,
         )
-        attention_output = self_attention_outputs[0][0]
-        layout_attention_output = self_attention_outputs[0][1]
+        attention_output = self_attention_outputs[0]
+        layout_attention_output = self_attention_outputs[1]

-        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+        outputs = self_attention_outputs[2:]  # add self attentions if we output attention weights

         layer_output = apply_chunking_to_forward(
             self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
@@ -406,7 +405,7 @@ class LiltLayer(GradientCheckpointingLayer):
         layout_layer_output = apply_chunking_to_forward(
             self.layout_feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, layout_attention_output
         )
-        outputs = ((layer_output, layout_layer_output),) + outputs
+        outputs = (layer_output, layout_layer_output) + outputs

         return outputs

@@ -451,11 +450,11 @@ class LiltEncoder(nn.Module):
                 output_attentions,
             )

-            hidden_states = layer_outputs[0][0]
-            layout_inputs = layer_outputs[0][1]
+            hidden_states = layer_outputs[0]
+            layout_inputs = layer_outputs[1]

             if output_attentions:
-                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                all_self_attentions = all_self_attentions + (layer_outputs[2],)

             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
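
The Lilt hunks above all apply one refactor: the nested `((context, layout_context), probs)` tuple is flattened to `(context, layout_context, probs)`, so every consumer shifts from `[0][0]`/`[0][1]`/`[1]` to `[0]`/`[1]`/`[2]`. A toy comparison, with strings standing in for tensors:

    # old: ((context_layer, layout_context_layer), attention_probs)
    # new: (context_layer, layout_context_layer, attention_probs)
    old_style = (("ctx", "layout_ctx"), "attn_probs")
    new_style = ("ctx", "layout_ctx", "attn_probs")

    assert old_style[0][0] == new_style[0]  # context layer
    assert old_style[0][1] == new_style[1]  # layout context layer
    assert old_style[1] == new_style[2]     # attention probs (only when output_attentions=True)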
@@ -500,6 +499,11 @@ class LiltPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = []

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, LiltTextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+

 @auto_docstring
 class LiltModel(LiltPreTrainedModel):
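
In plain torch terms, the value `init.copy_` writes into the `position_ids` buffer is just an arange row vector. A minimal illustration; the position count of 514 is a stand-in, the real size comes from the config:

    import torch

    max_positions = 514  # stand-in value
    position_ids = torch.empty(1, max_positions, dtype=torch.long)
    # Equivalent, in effect, to the init.copy_ call above:
    position_ids.copy_(torch.arange(max_positions).expand((1, -1)))
    print(position_ids[0, :5])  # tensor([0, 1, 2, 3, 4])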
@@ -538,6 +542,7 @@ class LiltModel(LiltPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPooling]:
         r"""
         bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
@@ -665,6 +670,7 @@ class LiltForSequenceClassification(LiltPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
         r"""
         bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
@@ -780,6 +786,7 @@ class LiltForTokenClassification(LiltPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
         r"""
         bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
@@ -897,6 +904,7 @@ class LiltForQuestionAnswering(LiltPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
         r"""
         bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
@@ -26,7 +26,7 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import (
     GenericForQuestionAnswering,
@@ -42,7 +42,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_llama import LlamaConfig


@@ -87,7 +87,7 @@ class LlamaRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
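
Registering `original_inv_freq` as a buffer (rather than a plain attribute) makes it follow `module.to(device)` moves, while `.clone()` keeps it independent of later in-place updates to `inv_freq`, and `persistent=False` keeps it out of checkpoints. A self-contained illustration of those three properties:

    import torch
    from torch import nn

    class Demo(nn.Module):
        def __init__(self):
            super().__init__()
            inv_freq = torch.rand(4)
            self.register_buffer("inv_freq", inv_freq, persistent=False)
            # As a buffer, this moves with .to(); a plain attribute would not.
            self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    m = Demo()
    m.inv_freq.mul_(2.0)                                 # e.g. a dynamic RoPE rescaling
    print(torch.equal(m.inv_freq, m.original_inv_freq))  # False: the clone kept the original
    print("original_inv_freq" in m.state_dict())         # False: persistent=False keeps it out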
@@ -126,7 +126,7 @@ class LlamaRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
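
Given the `mps` workaround on the line above, a plausible reading of `maybe_autocast` is a wrapper that falls back to a no-op on device types where `torch.autocast` is unavailable. The sketch below is purely illustrative of that intent, not the actual transformers helper:

    # Illustrative guess only -- not the real maybe_autocast implementation.
    from contextlib import contextmanager
    import torch

    @contextmanager
    def maybe_autocast(device_type: str, enabled: bool = False):
        if device_type in ("cuda", "cpu"):
            with torch.autocast(device_type=device_type, enabled=enabled):
                yield
        else:
            yield  # no-op where torch.autocast is unsupported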
@@ -224,6 +224,7 @@ def eager_attention_forward(
     return attn_output, attn_weights


+@use_kernelized_func(apply_rotary_pos_emb)
 class LlamaAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -249,7 +250,6 @@ class LlamaAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb

     def forward(
         self,
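
The decorator added above, together with the removed per-instance `self.rotary_fn = apply_rotary_pos_emb` (here and in LightGlue), suggests the rotary function is now attached at the class level, where it can presumably be swapped for a hub kernel. A purely illustrative sketch of that pattern, not the actual `use_kernelized_func` implementation:

    # Illustrative sketch only -- names and behavior are assumptions.
    def use_kernelized_func(func):
        def decorate(cls):
            # Attach the default implementation at class level; a kernel registry
            # could later replace this attribute with an optimized version.
            cls.rotary_fn = staticmethod(func)
            return cls
        return decorate

    def apply_rotary_pos_emb(q, k, cos, sin):  # stand-in signature
        return q, k

    @use_kernelized_func(apply_rotary_pos_emb)
    class Attention:
        pass

    print(Attention.rotary_fn)  # the default, available without per-instance assignment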
@@ -12,11 +12,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Optional, Union

-from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers
+from tokenizers import Tokenizer, decoders, pre_tokenizers
 from tokenizers.models import BPE

-from ...tokenization_utils_base import _get_prepend_scheme, generate_merges
+from ...tokenization_utils_base import _get_prepend_scheme
 from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging

@@ -61,6 +62,10 @@ class LlamaTokenizer(TokenizersBackend):
     refer to this superclass for more information regarding those methods.

     Args:
+        vocab (`str`, `dict` or `list`, *optional*):
+            Path to the vocabulary file, a dictionary or a list of tokens.
+        merges (`str` or `list`, *optional*):
+            Path to the merges file or a list of merges.
         clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
             Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
             extra spaces.
@@ -84,42 +89,32 @@ class LlamaTokenizer(TokenizersBackend):
     vocab_files_names = VOCAB_FILES_NAMES
     padding_side = "left"
     model_input_names = ["input_ids", "attention_mask"]
+    model = BPE

     def __init__(
         self,
+        vocab: Optional[Union[str, dict, list]] = None,
+        merges: Optional[Union[str, list]] = None,
         clean_up_tokenization_spaces=False,
         unk_token="<unk>",
         bos_token="<s>",
         eos_token="</s>",
-        add_bos_token=True,
-        add_eos_token=False,
         use_default_system_prompt=False,
         legacy=False,
         add_prefix_space=None,
-        vocab=None,
-        merges=None,
         **kwargs,
     ):
         self.add_prefix_space = add_prefix_space if add_prefix_space is not None else True
-
-        if vocab is not None:
-            self._vocab = (
-                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
-            )
-        else:
+        self.legacy = legacy
+        self._vocab = vocab
+        if vocab is None:
            self._vocab = {
                str(unk_token): 0,
                str(bos_token): 1,
                str(eos_token): 2,
            }

-        special_tokens = {str(eos_token), str(bos_token), str(unk_token)}
-
-        filtered_vocab = {t: i for t, i in self._vocab.items() if t not in special_tokens}
-        if merges is not None:
-            self._merges = [tuple(merge) if isinstance(merge, list) else merge for merge in merges]
-        else:
-            self._merges = generate_merges(filtered_vocab)
+        self._merges = merges or []
         self._tokenizer = Tokenizer(
             BPE(vocab=self._vocab, merges=self._merges, fuse_unk=True, byte_fallback=True, dropout=None)
         )
@@ -138,40 +133,17 @@ class LlamaTokenizer(TokenizersBackend):
             sequence += [decoders.Strip(content=" ", left=1)]

         self._tokenizer.decoder = decoders.Sequence(sequence)
-        tokenizer_object = self._tokenizer
-
+        self.use_default_system_prompt = use_default_system_prompt
         super().__init__(
-            tokenizer_object=tokenizer_object,
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
-            add_bos_token=add_bos_token,
-            add_eos_token=add_eos_token,
             use_default_system_prompt=use_default_system_prompt,
             add_prefix_space=add_prefix_space,
             **kwargs,
         )

-        self._add_bos_token = add_bos_token
-        self._add_eos_token = add_eos_token
-        self.use_default_system_prompt = use_default_system_prompt
-
-        self._post_init()
-
-    def _post_init(self):
-        """Post-initialization setup that needs to run after _tokenizer is set."""
-        # Only set pre_tokenizer/normalizer for Llama-3 style tokenizers (use Sequence)
-        pre_tok = self._tokenizer.pre_tokenizer
-        if pre_tok is None or type(pre_tok).__name__ != "Sequence":
-            self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
-                replacement="▁", prepend_scheme="first", split=False
-            )
-            self._tokenizer.normalizer = None
-        self.add_tokens([AddedToken(token, special=True) for token in self.all_special_tokens])
-        super()._post_init()
-        self.update_post_processor()
-

 __all__ = ["LlamaTokenizer", "LlamaTokenizerFast"]

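With the new signature, `vocab` and `merges` become the leading positional parameters and may be omitted, in which case `__init__` falls back to a minimal three-token vocab and empty merges. A constructor smoke test, assuming the rc2 base class wires up `self._tokenizer` as the hunks above imply:

    from transformers import LlamaTokenizer

    tok = LlamaTokenizer()  # no vocab: falls back to {<unk>: 0, <s>: 1, </s>: 2}
    print(tok.get_vocab())  # expected to contain those ids (plus any added specials)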
@@ -419,10 +419,9 @@ class Llama4ImageProcessorFast(BaseImageProcessorFast):
             )
             grouped_processed_images[shape] = torch.cat([processed_images, global_tiles.unsqueeze(1)], dim=1)
         processed_images = reorder_images(grouped_processed_images, grouped_images_index)
-        aspect_ratios_list = reorder_images(grouped_aspect_ratios, grouped_images_index)
+        aspect_ratios = reorder_images(grouped_aspect_ratios, grouped_images_index)

         processed_images = torch.cat(processed_images, dim=0) if return_tensors else processed_images
-        aspect_ratios = torch.stack(aspect_ratios_list, dim=0) if return_tensors else aspect_ratios_list
         return BatchFeature(
             data={"pixel_values": processed_images, "aspect_ratios": aspect_ratios}, tensor_type=return_tensors
         )
@@ -40,7 +40,7 @@ from ...modeling_rope_utils import (
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_llama4 import Llama4Config, Llama4TextConfig


@@ -188,7 +188,7 @@ class Llama4TextRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -228,7 +228,7 @@ class Llama4TextRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.to(x.device) @ position_ids_expanded).transpose(1, 2)
             freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # Convert to complex representation
             freqs_cis = freqs_cis * self.attention_scaling
@@ -1072,6 +1072,7 @@ class Llama4VisionModel(Llama4PreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[BaseModelOutput, tuple[torch.Tensor, ...]]:
         r"""

@@ -1386,6 +1387,7 @@ class Llama4ForConditionalGeneration(Llama4PreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1397,12 +1399,15 @@ class Llama4ForConditionalGeneration(Llama4PreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available.
+            # In subsequent iterations, they are already merged with text and cached.
+            # NOTE: the first iteration doesn't have to be prefill; it can be the first
+            # iteration with a question and a cached system prompt (continue generating from cache)
             model_inputs["pixel_values"] = pixel_values

         return model_inputs
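
The same gating change lands in Llava and LlavaNext below. Isolated, the condition reads as follows; the old `cache_position[0] == 0` test dropped pixel values whenever generation continued from a cached prefix (position already past 0 on the first real step), which the explicit flag avoids:

    # The gating condition, isolated for inspection.
    def should_forward_pixel_values(is_first_iteration: bool, use_cache: bool) -> bool:
        return is_first_iteration or not use_cache

    print(should_forward_pixel_values(True, True))    # first step: forward pixels
    print(should_forward_pixel_values(False, True))   # later cached steps: skip
    print(should_forward_pixel_values(False, False))  # no cache: pixels every step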
@@ -149,7 +149,6 @@ class LlavaImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images

         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -202,10 +202,11 @@ class LlavaModel(LlavaPreTrainedModel):
         image_features = self.multi_modal_projector(selected_image_feature)

         if "image_sizes" in kwargs:
-            split_sizes = [
-                (height // self.vision_tower.patch_size) * (width // self.vision_tower.patch_size)
-                for height, width in kwargs["image_sizes"]
-            ]
+            split_sizes = (
+                (torch.as_tensor(kwargs["image_sizes"], device=image_features.device) // self.vision_tower.patch_size)
+                .prod(dim=-1)
+                .tolist()
+            )
             image_features = torch.split(image_features.squeeze(0), split_sizes)
         else:
             image_features = list(image_features)
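
The rewritten `split_sizes` computation replaces the Python loop with one vectorized floor-divide and a row-wise product; both forms yield identical split sizes. A quick check with an illustrative patch size (the real value comes from the vision tower):

    import torch

    patch_size = 14  # illustrative
    image_sizes = [(336, 336), (336, 672)]

    # Old: a Python loop over (height, width).
    old = [(h // patch_size) * (w // patch_size) for h, w in image_sizes]

    # New: one vectorized floor-divide plus a row-wise product.
    new = (torch.as_tensor(image_sizes) // patch_size).prod(dim=-1).tolist()

    print(old, new)  # [576, 1152] [576, 1152] -- identical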
@@ -437,6 +438,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -448,12 +450,15 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available.
+            # In subsequent iterations, they are already merged with text and cached.
+            # NOTE: the first iteration doesn't have to be prefill; it can be the first
+            # iteration with a question and a cached system prompt (continue generating from cache)
             model_inputs["pixel_values"] = pixel_values

         return model_inputs
@@ -260,7 +260,6 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast):

         if do_pad:
             processed_images = self._pad_for_batching(processed_images)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(
             data={"pixel_values": processed_images, "image_sizes": image_sizes}, tensor_type=return_tensors
         )
@@ -692,6 +692,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -703,12 +704,15 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

-        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-        # Otherwise we need pixel values to be passed to model
-        if cache_position[0] == 0:
+        # Pixel values are used only in the first iteration if available.
+        # In subsequent iterations, they are already merged with text and cached.
+        # NOTE: the first iteration doesn't have to be prefill; it can be the first
+        # iteration with a question and a cached system prompt (continue generating from cache)
+        if is_first_iteration or not kwargs.get("use_cache", True):
             model_inputs["pixel_values"] = pixel_values
             model_inputs["image_sizes"] = image_sizes