transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (835)
  1. transformers/__init__.py +49 -3
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/cli/serve.py +47 -17
  6. transformers/configuration_utils.py +114 -70
  7. transformers/conversion_mapping.py +83 -7
  8. transformers/convert_slow_tokenizer.py +225 -10
  9. transformers/core_model_loading.py +374 -147
  10. transformers/data/data_collator.py +12 -4
  11. transformers/dependency_versions_table.py +2 -3
  12. transformers/dynamic_module_utils.py +1 -2
  13. transformers/feature_extraction_utils.py +55 -24
  14. transformers/file_utils.py +0 -1
  15. transformers/generation/__init__.py +11 -1
  16. transformers/generation/candidate_generator.py +79 -31
  17. transformers/generation/configuration_utils.py +165 -124
  18. transformers/generation/continuous_batching/__init__.py +4 -0
  19. transformers/generation/continuous_batching/cache.py +47 -18
  20. transformers/generation/continuous_batching/cache_manager.py +131 -34
  21. transformers/generation/continuous_batching/continuous_api.py +228 -136
  22. transformers/generation/continuous_batching/requests.py +28 -1
  23. transformers/generation/continuous_batching/scheduler.py +11 -4
  24. transformers/generation/stopping_criteria.py +1 -1
  25. transformers/generation/utils.py +108 -110
  26. transformers/generation/watermarking.py +8 -5
  27. transformers/image_processing_base.py +3 -14
  28. transformers/image_processing_utils_fast.py +15 -4
  29. transformers/initialization.py +37 -0
  30. transformers/integrations/__init__.py +16 -2
  31. transformers/integrations/accelerate.py +58 -113
  32. transformers/integrations/aqlm.py +36 -66
  33. transformers/integrations/awq.py +46 -515
  34. transformers/integrations/bitnet.py +47 -105
  35. transformers/integrations/bitsandbytes.py +91 -202
  36. transformers/integrations/deepspeed.py +18 -2
  37. transformers/integrations/eetq.py +84 -81
  38. transformers/integrations/fbgemm_fp8.py +191 -145
  39. transformers/integrations/finegrained_fp8.py +241 -208
  40. transformers/integrations/flash_attention.py +2 -2
  41. transformers/integrations/fp_quant.py +92 -0
  42. transformers/integrations/ggml.py +11 -1
  43. transformers/integrations/higgs.py +37 -62
  44. transformers/integrations/hub_kernels.py +65 -8
  45. transformers/integrations/integration_utils.py +45 -0
  46. transformers/integrations/mistral.py +12 -0
  47. transformers/integrations/moe.py +240 -0
  48. transformers/integrations/mxfp4.py +28 -74
  49. transformers/integrations/peft.py +12 -29
  50. transformers/integrations/quanto.py +77 -56
  51. transformers/integrations/quark.py +55 -0
  52. transformers/integrations/spqr.py +42 -90
  53. transformers/integrations/tensor_parallel.py +167 -221
  54. transformers/integrations/torchao.py +32 -38
  55. transformers/integrations/vptq.py +40 -59
  56. transformers/modelcard.py +1 -2
  57. transformers/modeling_gguf_pytorch_utils.py +74 -19
  58. transformers/modeling_rope_utils.py +107 -86
  59. transformers/modeling_utils.py +611 -527
  60. transformers/models/__init__.py +22 -0
  61. transformers/models/afmoe/modeling_afmoe.py +10 -19
  62. transformers/models/afmoe/modular_afmoe.py +5 -13
  63. transformers/models/aimv2/modeling_aimv2.py +4 -0
  64. transformers/models/aimv2/modular_aimv2.py +4 -0
  65. transformers/models/albert/modeling_albert.py +3 -0
  66. transformers/models/albert/tokenization_albert.py +6 -12
  67. transformers/models/align/modeling_align.py +14 -6
  68. transformers/models/altclip/modeling_altclip.py +11 -3
  69. transformers/models/apertus/modeling_apertus.py +8 -6
  70. transformers/models/apertus/modular_apertus.py +4 -1
  71. transformers/models/arcee/modeling_arcee.py +5 -5
  72. transformers/models/aria/modeling_aria.py +12 -8
  73. transformers/models/aria/modular_aria.py +7 -3
  74. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  75. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  76. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  77. transformers/models/auto/auto_factory.py +1 -1
  78. transformers/models/auto/configuration_auto.py +38 -0
  79. transformers/models/auto/feature_extraction_auto.py +9 -3
  80. transformers/models/auto/image_processing_auto.py +5 -2
  81. transformers/models/auto/modeling_auto.py +37 -0
  82. transformers/models/auto/processing_auto.py +22 -10
  83. transformers/models/auto/tokenization_auto.py +147 -566
  84. transformers/models/auto/video_processing_auto.py +5 -2
  85. transformers/models/autoformer/modeling_autoformer.py +4 -0
  86. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  87. transformers/models/bamba/modeling_bamba.py +21 -21
  88. transformers/models/bamba/modular_bamba.py +17 -16
  89. transformers/models/bark/modeling_bark.py +11 -0
  90. transformers/models/bart/configuration_bart.py +0 -1
  91. transformers/models/bart/modeling_bart.py +14 -0
  92. transformers/models/barthez/tokenization_barthez.py +5 -10
  93. transformers/models/beit/image_processing_beit_fast.py +0 -1
  94. transformers/models/beit/modeling_beit.py +6 -1
  95. transformers/models/bert/modeling_bert.py +3 -0
  96. transformers/models/bert/tokenization_bert.py +8 -21
  97. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  98. transformers/models/big_bird/modeling_big_bird.py +9 -0
  99. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  100. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
  101. transformers/models/biogpt/modeling_biogpt.py +2 -0
  102. transformers/models/biogpt/modular_biogpt.py +2 -0
  103. transformers/models/bit/modeling_bit.py +16 -3
  104. transformers/models/bitnet/modeling_bitnet.py +5 -5
  105. transformers/models/blenderbot/modeling_blenderbot.py +12 -0
  106. transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
  107. transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
  108. transformers/models/blip/modeling_blip.py +2 -0
  109. transformers/models/blip/modeling_blip_text.py +10 -0
  110. transformers/models/blip_2/modeling_blip_2.py +4 -1
  111. transformers/models/bloom/modeling_bloom.py +17 -44
  112. transformers/models/blt/modeling_blt.py +164 -4
  113. transformers/models/blt/modular_blt.py +170 -5
  114. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  115. transformers/models/bridgetower/modeling_bridgetower.py +11 -1
  116. transformers/models/bros/modeling_bros.py +12 -0
  117. transformers/models/camembert/modeling_camembert.py +109 -106
  118. transformers/models/camembert/tokenization_camembert.py +8 -12
  119. transformers/models/canine/modeling_canine.py +11 -0
  120. transformers/models/canine/tokenization_canine.py +2 -0
  121. transformers/models/chameleon/modeling_chameleon.py +11 -5
  122. transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
  123. transformers/models/clap/feature_extraction_clap.py +2 -2
  124. transformers/models/clap/modeling_clap.py +30 -15
  125. transformers/models/clip/modeling_clip.py +2 -0
  126. transformers/models/clip/tokenization_clip.py +22 -44
  127. transformers/models/clipseg/modeling_clipseg.py +9 -0
  128. transformers/models/clvp/modeling_clvp.py +19 -3
  129. transformers/models/clvp/tokenization_clvp.py +1 -63
  130. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  131. transformers/models/codegen/modeling_codegen.py +13 -4
  132. transformers/models/codegen/tokenization_codegen.py +14 -43
  133. transformers/models/cohere/modeling_cohere.py +5 -4
  134. transformers/models/cohere/modular_cohere.py +2 -1
  135. transformers/models/cohere/tokenization_cohere.py +12 -42
  136. transformers/models/cohere2/modeling_cohere2.py +8 -7
  137. transformers/models/cohere2/modular_cohere2.py +5 -5
  138. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
  139. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  140. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  141. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  142. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  143. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  144. transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
  145. transformers/models/convbert/modeling_convbert.py +9 -0
  146. transformers/models/convnext/image_processing_convnext.py +2 -2
  147. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  148. transformers/models/convnext/modeling_convnext.py +2 -4
  149. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  150. transformers/models/csm/generation_csm.py +19 -22
  151. transformers/models/csm/modeling_csm.py +7 -4
  152. transformers/models/csm/modular_csm.py +2 -0
  153. transformers/models/ctrl/modeling_ctrl.py +15 -2
  154. transformers/models/cvt/modeling_cvt.py +7 -1
  155. transformers/models/cwm/modeling_cwm.py +5 -5
  156. transformers/models/d_fine/configuration_d_fine.py +3 -4
  157. transformers/models/d_fine/modeling_d_fine.py +48 -39
  158. transformers/models/d_fine/modular_d_fine.py +16 -4
  159. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  160. transformers/models/dab_detr/modeling_dab_detr.py +5 -1
  161. transformers/models/dac/modeling_dac.py +6 -6
  162. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  163. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  164. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  165. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  166. transformers/models/dbrx/configuration_dbrx.py +9 -1
  167. transformers/models/dbrx/modeling_dbrx.py +3 -3
  168. transformers/models/deberta/modeling_deberta.py +7 -0
  169. transformers/models/deberta/tokenization_deberta.py +11 -20
  170. transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
  171. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  172. transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
  173. transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
  174. transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
  175. transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
  176. transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
  177. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  178. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  179. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  180. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  181. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  182. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  183. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  184. transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
  185. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  186. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  187. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  188. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  189. transformers/models/detr/configuration_detr.py +1 -1
  190. transformers/models/detr/modeling_detr.py +13 -1
  191. transformers/models/dia/generation_dia.py +3 -10
  192. transformers/models/dia/modeling_dia.py +16 -4
  193. transformers/models/dia/modular_dia.py +11 -1
  194. transformers/models/dia/processing_dia.py +1 -1
  195. transformers/models/diffllama/modeling_diffllama.py +5 -5
  196. transformers/models/diffllama/modular_diffllama.py +2 -2
  197. transformers/models/dinat/modeling_dinat.py +3 -0
  198. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  199. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  200. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
  201. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
  202. transformers/models/distilbert/modeling_distilbert.py +11 -9
  203. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  204. transformers/models/doge/modeling_doge.py +3 -4
  205. transformers/models/doge/modular_doge.py +0 -1
  206. transformers/models/donut/image_processing_donut_fast.py +0 -1
  207. transformers/models/donut/modeling_donut_swin.py +18 -12
  208. transformers/models/dots1/modeling_dots1.py +23 -11
  209. transformers/models/dots1/modular_dots1.py +5 -3
  210. transformers/models/dpr/modeling_dpr.py +5 -0
  211. transformers/models/dpr/tokenization_dpr.py +12 -0
  212. transformers/models/dpt/configuration_dpt.py +1 -1
  213. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  214. transformers/models/dpt/modular_dpt.py +1 -2
  215. transformers/models/edgetam/configuration_edgetam.py +1 -1
  216. transformers/models/edgetam/modeling_edgetam.py +6 -3
  217. transformers/models/edgetam/modular_edgetam.py +15 -14
  218. transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
  219. transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
  220. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  221. transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
  222. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  223. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  224. transformers/models/efficientnet/modeling_efficientnet.py +7 -1
  225. transformers/models/electra/modeling_electra.py +7 -0
  226. transformers/models/emu3/modeling_emu3.py +12 -6
  227. transformers/models/emu3/modular_emu3.py +7 -1
  228. transformers/models/encodec/modeling_encodec.py +14 -0
  229. transformers/models/eomt/image_processing_eomt.py +13 -1
  230. transformers/models/eomt/image_processing_eomt_fast.py +60 -16
  231. transformers/models/eomt/modeling_eomt.py +7 -0
  232. transformers/models/eomt/modular_eomt.py +7 -0
  233. transformers/models/ernie/modeling_ernie.py +6 -0
  234. transformers/models/ernie/modular_ernie.py +6 -0
  235. transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
  236. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  237. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
  238. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
  239. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  240. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  241. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  242. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  243. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  244. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  245. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  246. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  247. transformers/models/esm/modeling_esm.py +6 -0
  248. transformers/models/esm/modeling_esmfold.py +11 -5
  249. transformers/models/evolla/modeling_evolla.py +13 -5
  250. transformers/models/evolla/modular_evolla.py +8 -0
  251. transformers/models/exaone4/modeling_exaone4.py +3 -3
  252. transformers/models/exaone4/modular_exaone4.py +0 -1
  253. transformers/models/falcon/modeling_falcon.py +9 -4
  254. transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
  255. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  256. transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
  257. transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
  258. transformers/models/fast_vlm/__init__.py +27 -0
  259. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  260. transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
  261. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  262. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
  263. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  264. transformers/models/flaubert/modeling_flaubert.py +21 -15
  265. transformers/models/flava/image_processing_flava_fast.py +0 -2
  266. transformers/models/flava/modeling_flava.py +10 -2
  267. transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
  268. transformers/models/florence2/modeling_florence2.py +22 -4
  269. transformers/models/florence2/modular_florence2.py +15 -1
  270. transformers/models/fnet/modeling_fnet.py +14 -0
  271. transformers/models/focalnet/modeling_focalnet.py +4 -0
  272. transformers/models/fsmt/modeling_fsmt.py +2 -0
  273. transformers/models/funnel/modeling_funnel.py +8 -0
  274. transformers/models/funnel/tokenization_funnel.py +17 -24
  275. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  276. transformers/models/fuyu/modeling_fuyu.py +3 -1
  277. transformers/models/fuyu/processing_fuyu.py +19 -3
  278. transformers/models/gemma/modeling_gemma.py +14 -16
  279. transformers/models/gemma/modular_gemma.py +9 -11
  280. transformers/models/gemma/tokenization_gemma.py +10 -27
  281. transformers/models/gemma2/modeling_gemma2.py +5 -5
  282. transformers/models/gemma2/modular_gemma2.py +3 -2
  283. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  284. transformers/models/gemma3/modeling_gemma3.py +42 -91
  285. transformers/models/gemma3/modular_gemma3.py +38 -87
  286. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  287. transformers/models/gemma3n/modeling_gemma3n.py +65 -218
  288. transformers/models/gemma3n/modular_gemma3n.py +68 -68
  289. transformers/models/git/modeling_git.py +183 -126
  290. transformers/models/glm/modeling_glm.py +5 -5
  291. transformers/models/glm4/modeling_glm4.py +5 -5
  292. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  293. transformers/models/glm46v/modeling_glm46v.py +3 -1
  294. transformers/models/glm46v/modular_glm46v.py +3 -0
  295. transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
  296. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  297. transformers/models/glm4v/configuration_glm4v.py +3 -1
  298. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  299. transformers/models/glm4v/modeling_glm4v.py +18 -8
  300. transformers/models/glm4v/modular_glm4v.py +17 -7
  301. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  302. transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
  303. transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
  304. transformers/models/glmasr/__init__.py +30 -0
  305. transformers/models/glmasr/configuration_glmasr.py +197 -0
  306. transformers/models/glmasr/modeling_glmasr.py +512 -0
  307. transformers/models/glmasr/modular_glmasr.py +433 -0
  308. transformers/models/glmasr/processing_glmasr.py +332 -0
  309. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  310. transformers/models/glpn/modeling_glpn.py +2 -0
  311. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  312. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  313. transformers/models/gpt2/modeling_gpt2.py +13 -6
  314. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  315. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
  316. transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
  317. transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
  318. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  319. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  320. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
  321. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  322. transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
  323. transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
  324. transformers/models/gptj/modeling_gptj.py +18 -6
  325. transformers/models/granite/modeling_granite.py +5 -5
  326. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  327. transformers/models/granitemoe/modeling_granitemoe.py +6 -9
  328. transformers/models/granitemoe/modular_granitemoe.py +1 -4
  329. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  330. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
  331. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  332. transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
  333. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  334. transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
  335. transformers/models/groupvit/modeling_groupvit.py +9 -1
  336. transformers/models/helium/modeling_helium.py +5 -4
  337. transformers/models/herbert/tokenization_herbert.py +9 -25
  338. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
  339. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
  340. transformers/models/hiera/modeling_hiera.py +4 -0
  341. transformers/models/hubert/modeling_hubert.py +7 -0
  342. transformers/models/hubert/modular_hubert.py +5 -0
  343. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
  344. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  345. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  346. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
  347. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  348. transformers/models/ibert/modeling_ibert.py +22 -0
  349. transformers/models/idefics/modeling_idefics.py +15 -21
  350. transformers/models/idefics2/modeling_idefics2.py +7 -1
  351. transformers/models/idefics3/modeling_idefics3.py +5 -1
  352. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  353. transformers/models/imagegpt/modeling_imagegpt.py +11 -3
  354. transformers/models/informer/modeling_informer.py +4 -0
  355. transformers/models/informer/modular_informer.py +1 -0
  356. transformers/models/instructblip/modeling_instructblip.py +2 -0
  357. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  358. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  359. transformers/models/internvl/modeling_internvl.py +13 -12
  360. transformers/models/internvl/modular_internvl.py +7 -13
  361. transformers/models/internvl/video_processing_internvl.py +0 -1
  362. transformers/models/jais2/__init__.py +27 -0
  363. transformers/models/jais2/configuration_jais2.py +152 -0
  364. transformers/models/jais2/modeling_jais2.py +486 -0
  365. transformers/models/jais2/modular_jais2.py +196 -0
  366. transformers/models/jamba/modeling_jamba.py +25 -20
  367. transformers/models/jamba/modular_jamba.py +17 -17
  368. transformers/models/janus/image_processing_janus_fast.py +0 -1
  369. transformers/models/janus/modeling_janus.py +16 -7
  370. transformers/models/janus/modular_janus.py +17 -7
  371. transformers/models/jetmoe/modeling_jetmoe.py +4 -4
  372. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  373. transformers/models/kosmos2/modeling_kosmos2.py +15 -2
  374. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  375. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  376. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
  377. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  378. transformers/models/lasr/__init__.py +29 -0
  379. transformers/models/lasr/configuration_lasr.py +248 -0
  380. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  381. transformers/models/lasr/modeling_lasr.py +730 -0
  382. transformers/models/lasr/modular_lasr.py +576 -0
  383. transformers/models/lasr/processing_lasr.py +94 -0
  384. transformers/models/lasr/tokenization_lasr.py +186 -0
  385. transformers/models/layoutlm/modeling_layoutlm.py +10 -3
  386. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  387. transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
  388. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
  389. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  390. transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
  391. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  392. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  393. transformers/models/led/modeling_led.py +12 -0
  394. transformers/models/levit/modeling_levit.py +21 -0
  395. transformers/models/lfm2/modeling_lfm2.py +5 -6
  396. transformers/models/lfm2/modular_lfm2.py +0 -1
  397. transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
  398. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  399. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  400. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  401. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  402. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  403. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  404. transformers/models/lightglue/modeling_lightglue.py +3 -1
  405. transformers/models/lightglue/modular_lightglue.py +1 -0
  406. transformers/models/lilt/modeling_lilt.py +23 -15
  407. transformers/models/llama/modeling_llama.py +5 -5
  408. transformers/models/llama/tokenization_llama.py +15 -43
  409. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  410. transformers/models/llama4/modeling_llama4.py +11 -6
  411. transformers/models/llava/image_processing_llava_fast.py +0 -1
  412. transformers/models/llava/modeling_llava.py +12 -7
  413. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  414. transformers/models/llava_next/modeling_llava_next.py +7 -3
  415. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  416. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  417. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  418. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  419. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  420. transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
  421. transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
  422. transformers/models/longformer/modeling_longformer.py +6 -0
  423. transformers/models/longt5/modeling_longt5.py +4 -4
  424. transformers/models/luke/modeling_luke.py +9 -0
  425. transformers/models/luke/tokenization_luke.py +11 -38
  426. transformers/models/lxmert/modeling_lxmert.py +2 -0
  427. transformers/models/m2m_100/modeling_m2m_100.py +14 -0
  428. transformers/models/mamba/modeling_mamba.py +16 -23
  429. transformers/models/mamba2/modeling_mamba2.py +24 -23
  430. transformers/models/marian/configuration_marian.py +1 -1
  431. transformers/models/marian/modeling_marian.py +8 -0
  432. transformers/models/markuplm/modeling_markuplm.py +9 -8
  433. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  434. transformers/models/mask2former/configuration_mask2former.py +3 -3
  435. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  436. transformers/models/mask2former/modeling_mask2former.py +11 -0
  437. transformers/models/maskformer/configuration_maskformer.py +3 -3
  438. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  439. transformers/models/maskformer/modeling_maskformer.py +11 -1
  440. transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
  441. transformers/models/mbart/configuration_mbart.py +1 -0
  442. transformers/models/mbart/modeling_mbart.py +14 -0
  443. transformers/models/mbart/tokenization_mbart.py +11 -52
  444. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  445. transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
  446. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  447. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  448. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  449. transformers/models/mimi/modeling_mimi.py +28 -5
  450. transformers/models/minimax/modeling_minimax.py +19 -6
  451. transformers/models/minimax/modular_minimax.py +12 -1
  452. transformers/models/ministral/modeling_ministral.py +5 -5
  453. transformers/models/ministral3/configuration_ministral3.py +1 -1
  454. transformers/models/ministral3/modeling_ministral3.py +5 -4
  455. transformers/models/mistral/modeling_mistral.py +5 -4
  456. transformers/models/mistral3/modeling_mistral3.py +10 -4
  457. transformers/models/mistral3/modular_mistral3.py +3 -1
  458. transformers/models/mixtral/modeling_mixtral.py +15 -7
  459. transformers/models/mixtral/modular_mixtral.py +6 -2
  460. transformers/models/mlcd/modeling_mlcd.py +6 -0
  461. transformers/models/mlcd/modular_mlcd.py +4 -0
  462. transformers/models/mllama/modeling_mllama.py +15 -4
  463. transformers/models/mluke/tokenization_mluke.py +6 -6
  464. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  465. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
  466. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  467. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  468. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  469. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  470. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  471. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  472. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  473. transformers/models/mobilevit/modeling_mobilevit.py +7 -0
  474. transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
  475. transformers/models/modernbert/modeling_modernbert.py +16 -2
  476. transformers/models/modernbert/modular_modernbert.py +14 -1
  477. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
  478. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
  479. transformers/models/moonshine/modeling_moonshine.py +5 -3
  480. transformers/models/moshi/modeling_moshi.py +26 -53
  481. transformers/models/mpnet/modeling_mpnet.py +7 -0
  482. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  483. transformers/models/mpt/modeling_mpt.py +2 -0
  484. transformers/models/mra/modeling_mra.py +10 -1
  485. transformers/models/mt5/configuration_mt5.py +2 -3
  486. transformers/models/mt5/modeling_mt5.py +7 -10
  487. transformers/models/musicgen/modeling_musicgen.py +7 -9
  488. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
  489. transformers/models/mvp/modeling_mvp.py +14 -0
  490. transformers/models/nanochat/modeling_nanochat.py +5 -5
  491. transformers/models/nemotron/modeling_nemotron.py +7 -5
  492. transformers/models/nllb/tokenization_nllb.py +8 -22
  493. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  494. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  495. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  496. transformers/models/nougat/tokenization_nougat.py +15 -68
  497. transformers/models/nystromformer/modeling_nystromformer.py +13 -0
  498. transformers/models/olmo/modeling_olmo.py +5 -5
  499. transformers/models/olmo/modular_olmo.py +2 -2
  500. transformers/models/olmo2/modeling_olmo2.py +5 -6
  501. transformers/models/olmo2/modular_olmo2.py +0 -1
  502. transformers/models/olmo3/modeling_olmo3.py +5 -5
  503. transformers/models/olmoe/modeling_olmoe.py +15 -7
  504. transformers/models/olmoe/modular_olmoe.py +4 -2
  505. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  506. transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
  507. transformers/models/oneformer/configuration_oneformer.py +3 -3
  508. transformers/models/oneformer/modeling_oneformer.py +11 -39
  509. transformers/models/openai/modeling_openai.py +15 -0
  510. transformers/models/openai/tokenization_openai.py +10 -46
  511. transformers/models/opt/modeling_opt.py +2 -0
  512. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  513. transformers/models/ovis2/modeling_ovis2.py +15 -3
  514. transformers/models/ovis2/modular_ovis2.py +8 -0
  515. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  516. transformers/models/owlv2/modeling_owlv2.py +11 -3
  517. transformers/models/owlv2/modular_owlv2.py +0 -2
  518. transformers/models/owlvit/modeling_owlvit.py +11 -3
  519. transformers/models/paddleocr_vl/__init__.py +32 -0
  520. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  521. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
  522. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  523. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
  524. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
  525. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  526. transformers/models/paligemma/modeling_paligemma.py +25 -17
  527. transformers/models/parakeet/configuration_parakeet.py +4 -6
  528. transformers/models/parakeet/modeling_parakeet.py +14 -6
  529. transformers/models/parakeet/modular_parakeet.py +7 -2
  530. transformers/models/parakeet/processing_parakeet.py +1 -0
  531. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  532. transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
  533. transformers/models/patchtst/modeling_patchtst.py +25 -6
  534. transformers/models/pe_audio/__init__.py +30 -0
  535. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  536. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  537. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  538. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  539. transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
  540. transformers/models/pe_audio_video/__init__.py +29 -0
  541. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  542. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  543. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  544. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  545. transformers/models/pe_video/__init__.py +30 -0
  546. transformers/models/pe_video/configuration_pe_video.py +211 -0
  547. transformers/models/pe_video/modeling_pe_video.py +636 -0
  548. transformers/models/pe_video/modular_pe_video.py +219 -0
  549. transformers/models/pe_video/processing_pe_video.py +10 -0
  550. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  551. transformers/models/pegasus/configuration_pegasus.py +1 -0
  552. transformers/models/pegasus/modeling_pegasus.py +8 -0
  553. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  554. transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
  555. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  556. transformers/models/perceiver/modeling_perceiver.py +13 -1
  557. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  558. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  559. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  560. transformers/models/persimmon/modeling_persimmon.py +3 -2
  561. transformers/models/phi/modeling_phi.py +5 -6
  562. transformers/models/phi/modular_phi.py +0 -1
  563. transformers/models/phi3/modeling_phi3.py +3 -2
  564. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
  565. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
  566. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  567. transformers/models/phimoe/modeling_phimoe.py +15 -7
  568. transformers/models/phimoe/modular_phimoe.py +3 -3
  569. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  570. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  571. transformers/models/pixio/__init__.py +30 -0
  572. transformers/models/pixio/configuration_pixio.py +151 -0
  573. transformers/models/pixio/modeling_pixio.py +507 -0
  574. transformers/models/pixio/modular_pixio.py +404 -0
  575. transformers/models/pixtral/modeling_pixtral.py +3 -2
  576. transformers/models/pixtral/processing_pixtral.py +3 -1
  577. transformers/models/plbart/configuration_plbart.py +1 -0
  578. transformers/models/plbart/modeling_plbart.py +13 -0
  579. transformers/models/plbart/modular_plbart.py +8 -0
  580. transformers/models/plbart/tokenization_plbart.py +0 -2
  581. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  582. transformers/models/poolformer/modeling_poolformer.py +13 -1
  583. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  584. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  585. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  586. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  587. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  588. transformers/models/prophetnet/modeling_prophetnet.py +5 -1
  589. transformers/models/pvt/modeling_pvt.py +2 -0
  590. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  591. transformers/models/qwen2/modeling_qwen2.py +5 -5
  592. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  593. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  594. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
  595. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
  596. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  597. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
  598. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
  599. transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
  600. transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
  601. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  602. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  603. transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
  604. transformers/models/qwen3/modeling_qwen3.py +5 -5
  605. transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
  606. transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
  607. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  608. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
  609. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
  610. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  611. transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
  612. transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
  613. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  614. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
  615. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
  616. transformers/models/rag/configuration_rag.py +0 -8
  617. transformers/models/rag/modeling_rag.py +8 -9
  618. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
  619. transformers/models/reformer/modeling_reformer.py +13 -1
  620. transformers/models/reformer/tokenization_reformer.py +11 -28
  621. transformers/models/regnet/modeling_regnet.py +10 -1
  622. transformers/models/rembert/modeling_rembert.py +13 -1
  623. transformers/models/rembert/tokenization_rembert.py +3 -10
  624. transformers/models/resnet/modeling_resnet.py +19 -5
  625. transformers/models/roberta/modeling_roberta.py +3 -0
  626. transformers/models/roberta/modular_roberta.py +3 -0
  627. transformers/models/roberta/tokenization_roberta.py +18 -27
  628. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  629. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  630. transformers/models/roformer/modeling_roformer.py +6 -0
  631. transformers/models/roformer/tokenization_roformer.py +77 -412
  632. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  633. transformers/models/rt_detr/modeling_rt_detr.py +6 -0
  634. transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
  635. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  636. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
  637. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  638. transformers/models/rwkv/modeling_rwkv.py +2 -1
  639. transformers/models/sam/configuration_sam.py +1 -0
  640. transformers/models/sam/image_processing_sam_fast.py +0 -1
  641. transformers/models/sam/modeling_sam.py +4 -1
  642. transformers/models/sam2/configuration_sam2.py +1 -1
  643. transformers/models/sam2/modeling_sam2.py +7 -3
  644. transformers/models/sam2/modular_sam2.py +7 -3
  645. transformers/models/sam2_video/modeling_sam2_video.py +52 -43
  646. transformers/models/sam2_video/modular_sam2_video.py +32 -18
  647. transformers/models/sam3/configuration_sam3.py +21 -1
  648. transformers/models/sam3/modeling_sam3.py +100 -80
  649. transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
  650. transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
  651. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  652. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
  653. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  654. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  655. transformers/models/sam3_video/modeling_sam3_video.py +4 -3
  656. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  657. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  658. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  659. transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
  660. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  661. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
  662. transformers/models/seed_oss/modeling_seed_oss.py +3 -3
  663. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  664. transformers/models/segformer/modeling_segformer.py +6 -3
  665. transformers/models/segformer/modular_segformer.py +0 -1
  666. transformers/models/seggpt/modeling_seggpt.py +2 -0
  667. transformers/models/sew/modeling_sew.py +3 -0
  668. transformers/models/sew/modular_sew.py +1 -0
  669. transformers/models/sew_d/modeling_sew_d.py +3 -0
  670. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  671. transformers/models/siglip/modeling_siglip.py +24 -2
  672. transformers/models/siglip2/modeling_siglip2.py +67 -41
  673. transformers/models/siglip2/modular_siglip2.py +4 -0
  674. transformers/models/smollm3/modeling_smollm3.py +5 -5
  675. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  676. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  677. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  678. transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
  679. transformers/models/speecht5/modeling_speecht5.py +41 -1
  680. transformers/models/splinter/modeling_splinter.py +12 -3
  681. transformers/models/splinter/tokenization_splinter.py +9 -28
  682. transformers/models/squeezebert/modeling_squeezebert.py +8 -0
  683. transformers/models/stablelm/modeling_stablelm.py +4 -2
  684. transformers/models/starcoder2/modeling_starcoder2.py +5 -4
  685. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  686. transformers/models/superglue/modeling_superglue.py +1 -0
  687. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  688. transformers/models/superpoint/modeling_superpoint.py +1 -0
  689. transformers/models/swiftformer/modeling_swiftformer.py +6 -0
  690. transformers/models/swin/modeling_swin.py +20 -12
  691. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  692. transformers/models/swin2sr/modeling_swin2sr.py +51 -33
  693. transformers/models/swinv2/modeling_swinv2.py +45 -33
  694. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  695. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  696. transformers/models/t5/configuration_t5.py +7 -1
  697. transformers/models/t5/modeling_t5.py +8 -7
  698. transformers/models/t5/tokenization_t5.py +4 -8
  699. transformers/models/t5gemma/modeling_t5gemma.py +6 -6
  700. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  701. transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
  702. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  703. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  704. transformers/models/table_transformer/modeling_table_transformer.py +5 -1
  705. transformers/models/tapas/modeling_tapas.py +3 -0
  706. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  707. transformers/models/textnet/modeling_textnet.py +11 -2
  708. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  709. transformers/models/timesfm/modeling_timesfm.py +14 -0
  710. transformers/models/timesfm/modular_timesfm.py +14 -0
  711. transformers/models/timesformer/modeling_timesformer.py +2 -0
  712. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  713. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  714. transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
  715. transformers/models/trocr/modeling_trocr.py +3 -2
  716. transformers/models/tvp/configuration_tvp.py +5 -1
  717. transformers/models/tvp/modeling_tvp.py +6 -4
  718. transformers/models/udop/configuration_udop.py +1 -0
  719. transformers/models/udop/modeling_udop.py +7 -7
  720. transformers/models/udop/tokenization_udop.py +5 -13
  721. transformers/models/umt5/configuration_umt5.py +2 -2
  722. transformers/models/umt5/modeling_umt5.py +7 -6
  723. transformers/models/unispeech/modeling_unispeech.py +4 -0
  724. transformers/models/unispeech/modular_unispeech.py +2 -0
  725. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  726. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  727. transformers/models/univnet/modeling_univnet.py +1 -0
  728. transformers/models/upernet/modeling_upernet.py +1 -0
  729. transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
  730. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  731. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  732. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  733. transformers/models/video_llava/modeling_video_llava.py +7 -3
  734. transformers/models/vilt/configuration_vilt.py +2 -2
  735. transformers/models/vilt/modeling_vilt.py +13 -0
  736. transformers/models/vipllava/modeling_vipllava.py +7 -3
  737. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  738. transformers/models/visual_bert/modeling_visual_bert.py +8 -0
  739. transformers/models/vitdet/modeling_vitdet.py +2 -0
  740. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  741. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  742. transformers/models/vitmatte/modeling_vitmatte.py +5 -0
  743. transformers/models/vitpose/configuration_vitpose.py +1 -1
  744. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  745. transformers/models/vits/modeling_vits.py +1 -0
  746. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  747. transformers/models/voxtral/modeling_voxtral.py +2 -2
  748. transformers/models/voxtral/modular_voxtral.py +2 -2
  749. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  750. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
  751. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
  752. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
  753. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  754. transformers/models/wavlm/modeling_wavlm.py +5 -0
  755. transformers/models/whisper/generation_whisper.py +1 -0
  756. transformers/models/whisper/modeling_whisper.py +11 -3
  757. transformers/models/whisper/tokenization_whisper.py +4 -15
  758. transformers/models/x_clip/modeling_x_clip.py +5 -0
  759. transformers/models/xcodec/modeling_xcodec.py +5 -0
  760. transformers/models/xglm/modeling_xglm.py +11 -0
  761. transformers/models/xglm/tokenization_xglm.py +4 -9
  762. transformers/models/xlm/modeling_xlm.py +18 -14
  763. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  764. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  765. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  766. transformers/models/xlnet/modeling_xlnet.py +3 -1
  767. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  768. transformers/models/xmod/modeling_xmod.py +3 -0
  769. transformers/models/yoso/modeling_yoso.py +10 -1
  770. transformers/models/zamba/modeling_zamba.py +4 -1
  771. transformers/models/zamba2/modeling_zamba2.py +7 -4
  772. transformers/models/zamba2/modular_zamba2.py +1 -1
  773. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  774. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  775. transformers/models/zoedepth/modeling_zoedepth.py +8 -0
  776. transformers/pipelines/__init__.py +11 -9
  777. transformers/pipelines/automatic_speech_recognition.py +20 -12
  778. transformers/pipelines/base.py +2 -10
  779. transformers/pipelines/document_question_answering.py +4 -2
  780. transformers/pipelines/question_answering.py +1 -1
  781. transformers/pipelines/text_generation.py +1 -1
  782. transformers/pipelines/text_to_audio.py +2 -2
  783. transformers/processing_utils.py +133 -50
  784. transformers/quantizers/auto.py +2 -4
  785. transformers/quantizers/base.py +44 -174
  786. transformers/quantizers/quantizer_aqlm.py +2 -23
  787. transformers/quantizers/quantizer_auto_round.py +2 -12
  788. transformers/quantizers/quantizer_awq.py +20 -89
  789. transformers/quantizers/quantizer_bitnet.py +4 -14
  790. transformers/quantizers/quantizer_bnb_4bit.py +18 -155
  791. transformers/quantizers/quantizer_bnb_8bit.py +24 -110
  792. transformers/quantizers/quantizer_compressed_tensors.py +2 -9
  793. transformers/quantizers/quantizer_eetq.py +16 -74
  794. transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
  795. transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
  796. transformers/quantizers/quantizer_fp_quant.py +52 -82
  797. transformers/quantizers/quantizer_gptq.py +8 -28
  798. transformers/quantizers/quantizer_higgs.py +42 -60
  799. transformers/quantizers/quantizer_hqq.py +144 -153
  800. transformers/quantizers/quantizer_mxfp4.py +14 -194
  801. transformers/quantizers/quantizer_quanto.py +35 -79
  802. transformers/quantizers/quantizer_quark.py +36 -17
  803. transformers/quantizers/quantizer_spqr.py +4 -12
  804. transformers/quantizers/quantizer_torchao.py +50 -325
  805. transformers/quantizers/quantizer_vptq.py +4 -27
  806. transformers/quantizers/quantizers_utils.py +20 -0
  807. transformers/testing_utils.py +324 -47
  808. transformers/tokenization_mistral_common.py +7 -2
  809. transformers/tokenization_utils_base.py +116 -224
  810. transformers/tokenization_utils_tokenizers.py +190 -106
  811. transformers/trainer.py +51 -32
  812. transformers/trainer_callback.py +8 -0
  813. transformers/trainer_jit_checkpoint.py +126 -0
  814. transformers/trainer_seq2seq.py +4 -0
  815. transformers/trainer_utils.py +1 -1
  816. transformers/training_args.py +74 -38
  817. transformers/utils/__init__.py +7 -4
  818. transformers/utils/attention_visualizer.py +4 -4
  819. transformers/utils/auto_docstring.py +35 -25
  820. transformers/utils/generic.py +47 -1
  821. transformers/utils/hub.py +5 -15
  822. transformers/utils/import_utils.py +112 -25
  823. transformers/utils/kernel_config.py +74 -19
  824. transformers/utils/loading_report.py +19 -10
  825. transformers/utils/quantization_config.py +78 -245
  826. transformers/video_processing_utils.py +17 -14
  827. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
  828. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
  829. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
  830. transformers/kernels/__init__.py +0 -0
  831. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  832. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  833. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  834. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
  835. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0

transformers/models/pe_video/modular_pe_video.py +219 -0
@@ -0,0 +1,219 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ...modeling_outputs import BaseModelOutputWithPooling, MaskedLMOutput
+from ...utils import ModelOutput, auto_docstring, can_return_tuple
+from ...utils.generic import check_model_inputs
+from ..auto import AutoModel, AutoModelForImageClassification
+from ..pe_audio_video.modeling_pe_audio_video import (
+    PeAudioVideoContrastiveHead,
+    PeAudioVideoEncoder,
+    PeAudioVideoEncoderPatchEmbedder,
+    PeAudioVideoPreTrainedModel,
+)
+from .configuration_pe_video import PeVideoConfig, PeVideoEncoderConfig
+
+
+# TODO: not sure about the typing for text_model_output
+@dataclass
+# @auto_docstring
+class PeVideoOutput(ModelOutput):
+    loss: Optional[torch.FloatTensor] = None
+    logits_video_text: Optional[torch.FloatTensor] = None
+    text_video_embeds: Optional[torch.FloatTensor] = None
+    video_embeds: Optional[torch.FloatTensor] = None
+    text_outputs: BaseModelOutputWithPooling = None
+    video_outputs: BaseModelOutputWithPooling = None
+
+    def to_tuple(self) -> tuple[Any]:
+        return tuple(
+            self[k] if k not in ["text_outputs", "video_outputs"] else getattr(self, k).to_tuple() for k in self.keys()
+        )
+
+
+class PeVideoContrastiveHead(PeAudioVideoContrastiveHead): ...
+
+
+class PeVideoEncoderPatchEmbedder(PeAudioVideoEncoderPatchEmbedder): ...
+
+
+class PeVideoEncoderEmbedder(nn.Module):
+    def __init__(self, config: PeVideoEncoderConfig):
+        super().__init__()
+        self.vision_model = AutoModelForImageClassification.from_config(config.vision_config)
+        self.proj = nn.Linear(config.vision_config.num_labels, config.hidden_size, bias=False)
+        self.data_proj = nn.Linear(config.hidden_size, config.hidden_size)
+
+    def forward(
+        self,
+        pixel_values_videos: torch.Tensor,
+        padding_mask: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = pixel_values_videos.shape
+
+        pixel_values_videos = pixel_values_videos.view(-1, *input_shape[2:])
+        vision_encoder_outputs = self.vision_model(pixel_values_videos)
+
+        logits = vision_encoder_outputs.logits.view(*input_shape[:2], -1)
+        logits = F.normalize(logits, dim=-1)
+
+        vision_features = self.proj(logits)
+        inputs_embeds = self.data_proj(vision_features)
+
+        return inputs_embeds, padding_mask
+
+
+class PeVideoPreTrainedModel(PeAudioVideoPreTrainedModel):
+    base_model_prefix = "video_model"
+    main_input_name = "pixel_values_videos"
+
+
+@auto_docstring(
+    custom_intro="""
+    The PeVideo Encoder model.
+    """
+)
+class PeVideoEncoder(PeAudioVideoEncoder):
+    base_model_prefix = "video_model.video_encoder"
+    main_input_name = "pixel_values_videos"
+
+    def __init__(self, config: PeVideoEncoderConfig):
+        super().__init__(config)
+        self.embedder = PeVideoEncoderEmbedder(config)
+
+    @can_return_tuple
+    @check_model_inputs
+    def forward(
+        self,
+        pixel_values_videos: torch.Tensor,
+        padding_mask_videos: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> BaseModelOutputWithPooling:
+        inputs_embeds, padding_mask = self.embedder(pixel_values_videos, padding_mask=padding_mask_videos)
+        inputs_embeds, attention_mask = self.patch_embedder(inputs_embeds, padding_mask=padding_mask)
+
+        if attention_mask is not None:
+            attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device).unsqueeze(0)
+        position_embeddings = self.rotary_emb(inputs_embeds, position_ids)
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = encoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+        hidden_states = self.output(hidden_states)
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=hidden_states[:, 1:],
+            pooler_output=hidden_states[:, 0],
+        )
+
+
+class PeVideoModel(PeVideoPreTrainedModel):
+    main_input_name = "input_ids"
+
+    def __init__(self, config: PeVideoConfig):
+        super().__init__(config)
+        self.text_model = AutoModel.from_config(config.text_config)
+        self.video_encoder = PeVideoEncoder(config.video_config)
+
+        self.text_video_head = PeVideoContrastiveHead(config.text_config.hidden_size, config.text_config.hidden_size)
+        self.video_head = PeVideoContrastiveHead(config.video_config.hidden_size, config.text_config.hidden_size)
+
+        self.text_video_logit_scale = nn.Parameter(torch.zeros(1))
+        self.text_video_logit_bias = nn.Parameter(torch.zeros(1))
+
+        self.post_init()
+
+    def get_text_features(self, input_ids, attention_mask=None):
+        # TODO: should it be named feature or embeds
+        text_outputs: MaskedLMOutput = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            return_dict=True,
+        )
+
+        text_features = text_outputs.last_hidden_state
+        text_features = self.text_video_head(text_features)
164
+ return text_features
165
+
166
+ def get_video_features(self, pixel_values_videos, padding_mask_videos=None):
167
+ # TODO: should it be named feature or embeds
168
+ video_outputs: BaseModelOutputWithPooling = self.video_encoder(
169
+ pixel_values_videos=pixel_values_videos,
170
+ padding_mask_videos=padding_mask_videos,
171
+ return_dict=True,
172
+ )
173
+ video_features = self.video_head(video_outputs.pooler_output)
174
+ return video_features
175
+
176
+ @can_return_tuple
177
+ def forward(
178
+ self,
179
+ input_ids: torch.Tensor,
180
+ pixel_values_videos: torch.Tensor,
181
+ attention_mask: Optional[torch.Tensor] = None,
182
+ padding_mask_videos: Optional[torch.Tensor] = None,
183
+ return_loss: Optional[bool] = None,
184
+ **kwargs,
185
+ ) -> PeVideoOutput:
186
+ video_outputs: BaseModelOutputWithPooling = self.video_encoder(
187
+ pixel_values_videos=pixel_values_videos, padding_mask_videos=padding_mask_videos, **kwargs
188
+ )
189
+ kwargs["output_hidden_states"] = True
190
+ text_outputs: MaskedLMOutput = self.text_model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
191
+
192
+ video_embeds = video_outputs.pooler_output
193
+ video_embeds = self.video_head(video_embeds)
194
+
195
+ text_video_embeds = text_outputs.hidden_states[-1][:, 0]
196
+ text_video_embeds = self.text_video_head(text_video_embeds)
197
+
198
+ logits_video_text = video_embeds @ text_video_embeds.T
199
+ logits_video_text = logits_video_text * self.text_video_logit_scale + self.text_video_logit_bias
200
+
201
+ loss = None
202
+ if return_loss:
203
+ labels = torch.eye(logits_video_text.shape[0], device=logits_video_text.device)
204
+ loss = -F.logsigmoid(labels * logits_video_text).sum() / logits_video_text.shape[0]
205
+
206
+ return PeVideoOutput(
207
+ logits_video_text=logits_video_text,
208
+ text_video_embeds=text_video_embeds,
209
+ video_embeds=video_embeds,
210
+ text_outputs=text_outputs,
211
+ video_outputs=video_outputs,
212
+ loss=loss,
213
+ )
214
+
215
+
216
+ __all__ = [
217
+ "PeVideoEncoder",
218
+ "PeVideoModel",
219
+ ]
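
Taken together, `PeVideoModel.forward` embeds a batch of videos and texts, builds a scaled video-text similarity matrix, and optionally computes the pairwise sigmoid loss. A minimal smoke-test sketch of that API with random inputs — the config values and tensor shapes here are illustrative assumptions, not released defaults:

```python
import torch

from transformers.models.pe_video.configuration_pe_video import PeVideoConfig
from transformers.models.pe_video.modeling_pe_video import PeVideoModel

config = PeVideoConfig()  # assumed to provide workable text/video sub-configs
model = PeVideoModel(config).eval()

input_ids = torch.randint(0, config.text_config.vocab_size, (2, 16))
pixel_values_videos = torch.randn(2, 8, 3, 224, 224)  # (batch, frames, channels, height, width)

with torch.no_grad():
    out = model(input_ids=input_ids, pixel_values_videos=pixel_values_videos, return_loss=True)

# out.logits_video_text[i, j] scores video i against text j (after logit scale and bias)
print(out.logits_video_text.shape, out.loss)
```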
@@ -0,0 +1,10 @@
+ from ...processing_utils import ProcessorMixin
+
+
+ class PeVideoProcessor(ProcessorMixin):
+     attributes = ["video_processor", "tokenizer"]
+     video_processor_class = "PeVideoVideoProcessor"
+     tokenizer_class = "AutoTokenizer"
+
+
+ __all__ = ["PeVideoProcessor"]
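
As a hedged sketch of how this composite processor is wired (the checkpoint id is made up; only the attribute names come from the class above):

```python
import numpy as np

from transformers import AutoTokenizer, AutoVideoProcessor
from transformers.models.pe_video.processing_pe_video import PeVideoProcessor

repo = "hf-internal/pe-video-checkpoint"  # hypothetical repo id, for illustration only
processor = PeVideoProcessor(
    video_processor=AutoVideoProcessor.from_pretrained(repo),
    tokenizer=AutoTokenizer.from_pretrained(repo),
)

frames = [np.zeros((224, 224, 3), dtype=np.uint8)] * 8  # one dummy 8-frame video
# ProcessorMixin dispatches text to the tokenizer and videos to the video processor
inputs = processor(text=["a cooking tutorial"], videos=[frames], return_tensors="pt")
```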
@@ -0,0 +1,66 @@
+ # coding=utf-8
+ # Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from typing import Optional, Union
+
+ import torch
+
+ from ...image_processing_utils import BatchFeature
+ from ...image_utils import PILImageResampling
+ from ...processing_utils import Unpack, VideosKwargs
+ from ...video_processing_utils import BaseVideoProcessor, VideoMetadata
+ from ...video_utils import VideoInput
+
+
+ class PeVideoVideoProcessor(BaseVideoProcessor):
+     resample = PILImageResampling.BILINEAR
+
+     def sample_frames(
+         self,
+         metadata: VideoMetadata,
+         num_frames: Optional[int] = None,
+         fps: Optional[Union[int, float]] = None,
+         **kwargs,
+     ):
+         if num_frames:
+             # Evenly spaced indices over the clip, always keeping the first and last frame
+             total_frames = metadata.total_num_frames
+             if num_frames == 1:
+                 return torch.tensor([0])
+             frame_idxs = [int(i * (total_frames - 1) / (num_frames - 1)) for i in range(num_frames)]
+             return torch.tensor(frame_idxs)
+         else:
+             return super().sample_frames(metadata, num_frames, fps, **kwargs)
+
+     def _preprocess(
+         self,
+         videos: VideoInput,
+         **kwargs: Unpack[VideosKwargs],
+     ) -> BatchFeature:
+         # Call the parent with `return_tensors=None`, since it cannot pad variable-length videos;
+         # we pad (and build the padding mask) ourselves after the parent method returns.
+         return_tensors = kwargs.pop("return_tensors", None)
+         result = super()._preprocess(videos, **kwargs)
+         pixels = result.pixel_values_videos
+         data = {"pixel_values_videos": pixels}
+         if return_tensors:
+             lengths = torch.tensor([video.size(0) for video in pixels])
+             pixels = torch.nn.utils.rnn.pad_sequence(pixels, batch_first=True, padding_value=0.0)
+             data["pixel_values_videos"] = pixels
+             # Emit a mask only when the batch actually mixes video lengths
+             if lengths.unique().size(0) > 1:
+                 mask = torch.arange(lengths.max())[None] < lengths[:, None]
+                 data["padding_mask_videos"] = mask
+         return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+ __all__ = ["PeVideoVideoProcessor"]
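
Two behaviors above are worth pinning down: `sample_frames` picks evenly spaced indices that always include the first and last frame, and `_preprocess` emits `padding_mask_videos` only when the batch mixes video lengths. A small, self-contained illustration of both pieces of arithmetic (not a test of the class itself):

```python
import torch

# Even sampling, mirroring the list comprehension in sample_frames
total_frames, num_frames = 100, 5
idxs = [int(i * (total_frames - 1) / (num_frames - 1)) for i in range(num_frames)]
print(idxs)  # [0, 24, 49, 74, 99] -- first and last frame always kept

# Padding-mask construction, mirroring _preprocess
lengths = torch.tensor([8, 5])  # frames per video after sampling
mask = torch.arange(lengths.max())[None] < lengths[:, None]
print(mask.int())
# tensor([[1, 1, 1, 1, 1, 1, 1, 1],
#         [1, 1, 1, 1, 1, 0, 0, 0]])
```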
@@ -143,6 +143,7 @@ class PegasusConfig(PreTrainedConfig):
          self.use_cache = use_cache
          self.num_hidden_layers = encoder_layers
          self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
          super().__init__(
              pad_token_id=pad_token_id,
              eos_token_id=eos_token_id,
@@ -443,6 +443,8 @@ class PegasusPreTrainedModel(PreTrainedModel):
          super()._init_weights(module)
          if isinstance(module, PegasusSinusoidalPositionalEmbedding):
              init.copy_(module.weight, module.create_weight())
+         elif isinstance(module, PegasusForConditionalGeneration):
+             init.zeros_(module.final_logits_bias)


  class PegasusEncoder(PegasusPreTrainedModel):
@@ -518,6 +520,7 @@ class PegasusEncoder(PegasusPreTrainedModel):
          output_attentions=None,
          output_hidden_states=None,
          return_dict=None,
+         **kwargs,
      ):
          r"""
          Args:
@@ -695,6 +698,7 @@ class PegasusDecoder(PegasusPreTrainedModel):
          output_hidden_states=None,
          return_dict=None,
          cache_position=None,
+         **kwargs,
      ):
          r"""
          Args:
@@ -946,6 +950,7 @@ class PegasusModel(PegasusPreTrainedModel):
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          cache_position: Optional[torch.Tensor] = None,
+         **kwargs,
      ) -> Union[tuple, Seq2SeqModelOutput]:
          r"""
          decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1111,6 +1116,7 @@ class PegasusForConditionalGeneration(PegasusPreTrainedModel, GenerationMixin):
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          cache_position: Optional[torch.Tensor] = None,
+         **kwargs,
      ) -> Union[tuple, Seq2SeqLMOutput]:
          r"""
          decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1216,6 +1222,7 @@ class PegasusDecoderWrapper(PegasusPreTrainedModel):
      def __init__(self, config):
          super().__init__(config)
          self.decoder = PegasusDecoder(config)
+         self.post_init()

      def forward(self, *args, **kwargs):
          return self.decoder(*args, **kwargs)
@@ -1283,6 +1290,7 @@ class PegasusForCausalLM(PegasusPreTrainedModel, GenerationMixin):
          return_dict: Optional[bool] = None,
          cache_position: Optional[torch.LongTensor] = None,
          logits_to_keep: Union[int, torch.Tensor] = 0,
+         **kwargs,
      ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
          r"""
          labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -14,6 +14,8 @@
  # limitations under the License.
  """Tokenization class for model PEGASUS."""

+ from typing import Optional, Union
+
  from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
  from tokenizers.models import Unigram

@@ -70,15 +72,17 @@ class PegasusTokenizer(TokenizersBackend):
          that uses the tokens 2 - 104 only for pretraining
          offset (`int`, *optional*, defaults to 103):
              Offset for additional special tokens.
-         vocab (`dict`, *optional*):
-             Custom vocabulary dictionary. If not provided, a blank vocabulary is initialized.
+         vocab (`str` or `list[tuple[str, float]]`, *optional*):
+             Custom vocabulary with `(token, score)` tuples. If not provided, a blank vocabulary is initialized.
      """

      vocab_files_names = VOCAB_FILES_NAMES
      model_input_names = ["input_ids", "attention_mask"]
+     model = Unigram

      def __init__(
          self,
+         vocab: Optional[Union[str, list[tuple[str, float]]]] = None,
          pad_token="<pad>",
          eos_token="</s>",
          unk_token="<unk>",
@@ -86,60 +90,27 @@ class PegasusTokenizer(TokenizersBackend):
          mask_token_sent="<mask_1>",
          additional_special_tokens=None,
          offset=103,
-         vocab=None,
-         vocab_file=None,
          **kwargs,
      ):
          self.offset = offset
-         self.vocab_file = vocab_file

          if additional_special_tokens is None:
              additional_special_tokens = [mask_token_sent] if mask_token_sent is not None else []
              additional_special_tokens += [f"<unk_{i}>" for i in range(2, self.offset)]

-         if vocab is not None:
-             # For Pegasus, insert special tokens at the beginning
-             special_tokens_set = {pad_token, eos_token, mask_token_sent, mask_token, unk_token}
-             special_tokens_set.update(additional_special_tokens)
-
-             # Build special tokens in correct order
-             _vocab_list = [
-                 (str(pad_token), 0.0),
-                 (str(eos_token), 0.0),
-             ]
-             if mask_token_sent:
-                 _vocab_list.append((str(mask_token_sent), 0.0))
-             for token in additional_special_tokens:
-                 if token not in [pad_token, eos_token, mask_token_sent]:
-                     _vocab_list.append((str(token), 0.0))
-             if mask_token not in [t for t, _ in _vocab_list]:
-                 _vocab_list.append((str(mask_token), 0.0))
-             _vocab_list.append((str(unk_token), 0.0))
-
-             # Filter out special tokens from main vocab and combine
-             filtered_vocab = [(t, s) for t, s in vocab if t not in special_tokens_set]
-             _vocab_list = _vocab_list + filtered_vocab
-         else:
-             _vocab_list = [(str(unk_token), 0.0)]
-
-         self._vocab = {token: idx for idx, (token, _) in enumerate(_vocab_list)}
-
-         self._tokenizer = Tokenizer(Unigram(vocab=_vocab_list, unk_id=self._vocab.get(str(unk_token), 0)))
+         if vocab is None:
+             vocab = [(str(unk_token), 0.0), (str(pad_token), 0.0), (str(eos_token), 0.0), (str(mask_token), 0.0)]

+         self._vocab = vocab
+         # Look the unk token up by name so a custom score (or a missing entry) cannot break the lookup
+         unk_id = next((i for i, (token, _) in enumerate(vocab) if token == str(unk_token)), 0)
+         self._tokenizer = Tokenizer(Unigram(vocab=vocab, unk_id=unk_id))
          self._tokenizer.normalizer = normalizers.Sequence(
              [normalizers.Replace(Regex(r"\n"), " "), normalizers.Replace(Regex(r" {2,}"), " ")]
          )

-         self._tokenizer.post_processor = processors.TemplateProcessing(
-             single=f"$A {eos_token}",
-             pair=f"$A $B {eos_token}",
-             special_tokens=[(str(eos_token), self._vocab.get(str(eos_token), 1))],
-         )
-
-         tokenizer_object = self._tokenizer
+         self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True)
+         self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)

          super().__init__(
-             tokenizer_object=tokenizer_object,
              pad_token=pad_token,
              eos_token=eos_token,
              unk_token=unk_token,
@@ -149,9 +120,11 @@ class PegasusTokenizer(TokenizersBackend):
              additional_special_tokens=additional_special_tokens,
              **kwargs,
          )
-
-         self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True)
-         self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)
+         self._tokenizer.post_processor = processors.TemplateProcessing(
+             single=f"$A {eos_token}",
+             pair=f"$A $B {eos_token}",
+             special_tokens=[(str(eos_token), self.convert_tokens_to_ids(str(eos_token)))],
+         )


  __all__ = ["PegasusTokenizer"]
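
With the backend rewritten this way, the `TemplateProcessing` post-processor appends `</s>` to every encoded sequence. A hedged round trip with a toy `(token, score)` vocabulary (real checkpoints ship a trained Unigram vocabulary, so the ids below are illustrative):

```python
from transformers import PegasusTokenizer

# Toy vocab; specials first, matching the order the default vocab above uses
vocab = [("<unk>", 0.0), ("<pad>", 0.0), ("</s>", 0.0), ("<mask_2>", 0.0),
         ("▁hello", -1.0), ("▁world", -1.5)]
tok = PegasusTokenizer(vocab=vocab)

enc = tok("hello world")
print(tok.convert_ids_to_tokens(enc["input_ids"]))  # expect [..., '▁hello', '▁world', '</s>']
```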
@@ -821,6 +821,7 @@ class PegasusXEncoder(PegasusXPreTrainedModel):
          output_attentions=None,
          output_hidden_states=None,
          return_dict=None,
+         **kwargs,
      ):
          r"""
          Args:
@@ -989,6 +990,7 @@ class PegasusXDecoder(PegasusXPreTrainedModel):
          output_hidden_states=None,
          return_dict=None,
          cache_position=None,
+         **kwargs,
      ):
          r"""
          Args:
@@ -1241,6 +1243,7 @@ class PegasusXModel(PegasusXPreTrainedModel):
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          cache_position: Optional[torch.Tensor] = None,
+         **kwargs,
      ) -> Union[tuple, Seq2SeqModelOutput]:
          r"""
          decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1388,6 +1391,7 @@ class PegasusXForConditionalGeneration(PegasusXPreTrainedModel, GenerationMixin):
          output_hidden_states: Optional[bool] = None,
          return_dict: Optional[bool] = None,
          cache_position: Optional[torch.Tensor] = None,
+         **kwargs,
      ) -> Union[tuple, Seq2SeqLMOutput]:
          r"""
          decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1472,6 +1476,7 @@ class PegasusXDecoderWrapper(PegasusXPreTrainedModel):
      def __init__(self, config):
          super().__init__(config)
          self.decoder = PegasusXDecoder(config)
+         self.post_init()

      def forward(self, *args, **kwargs):
          return self.decoder(*args, **kwargs)
@@ -113,7 +113,6 @@ class PerceiverImageProcessorFast(BaseImageProcessorFast):
              processed_images_grouped[shape] = stacked_images

          processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-         processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

          return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
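
The deleted `torch.stack` is redundant under the assumption this change relies on (and which other files in this diff appear to implement): `BatchFeature` itself converts a list of equally-shaped tensors when `tensor_type` is set. A minimal sketch of that expected behavior:

```python
import torch
from transformers.feature_extraction_utils import BatchFeature

images = [torch.zeros(3, 224, 224), torch.zeros(3, 224, 224)]
feat = BatchFeature(data={"pixel_values": images}, tensor_type="pt")
print(feat["pixel_values"].shape)  # expected: torch.Size([2, 3, 224, 224])
```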
@@ -551,9 +551,13 @@ class PerceiverPreTrainedModel(PreTrainedModel):
          # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it loses the flag
          if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
              init.zeros_(module.weight[module.padding_idx])
-         elif isinstance(module, nn.LayerNorm):
+         elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
              init.zeros_(module.bias)
              init.ones_(module.weight)
+             if getattr(module, "running_mean", None) is not None:
+                 init.zeros_(module.running_mean)
+                 init.ones_(module.running_var)
+                 init.zeros_(module.num_batches_tracked)


  @auto_docstring(
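
For reference, the added branch amounts to the standard BatchNorm reset: identity affine transform plus zero-mean / unit-variance running statistics. A standalone sketch of the same initialization:

```python
import torch.nn as nn
from torch.nn import init

def reset_batchnorm(module: nn.BatchNorm2d) -> None:
    init.zeros_(module.bias)   # affine shift -> 0
    init.ones_(module.weight)  # affine scale -> 1
    # Running buffers exist only when track_running_stats=True
    if module.running_mean is not None:
        init.zeros_(module.running_mean)
        init.ones_(module.running_var)
        init.zeros_(module.num_batches_tracked)

reset_batchnorm(nn.BatchNorm2d(16))
```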
@@ -615,6 +619,7 @@ class PerceiverModel(PerceiverPreTrainedModel):
          output_hidden_states: Optional[bool] = None,
          interpolate_pos_encoding: bool = False,
          return_dict: Optional[bool] = None,
+         **kwargs,
      ) -> Union[tuple, PerceiverModelOutput]:
          r"""
          inputs (`torch.FloatTensor`):
@@ -850,6 +855,7 @@ class PerceiverForMaskedLM(PerceiverPreTrainedModel):
          labels: Optional[torch.Tensor] = None,
          return_dict: Optional[bool] = None,
          input_ids: Optional[torch.Tensor] = None,
+         **kwargs,
      ) -> Union[tuple, PerceiverMaskedLMOutput]:
          r"""
          inputs (`torch.FloatTensor`):
@@ -975,6 +981,7 @@ class PerceiverForSequenceClassification(PerceiverPreTrainedModel):
          labels: Optional[torch.Tensor] = None,
          return_dict: Optional[bool] = None,
          input_ids: Optional[torch.Tensor] = None,
+         **kwargs,
      ) -> Union[tuple, PerceiverClassifierOutput]:
          r"""
          inputs (`torch.FloatTensor`):
@@ -1107,6 +1114,7 @@ class PerceiverForImageClassificationLearned(PerceiverPreTrainedModel):
          interpolate_pos_encoding: bool = False,
          return_dict: Optional[bool] = None,
          pixel_values: Optional[torch.Tensor] = None,
+         **kwargs,
      ) -> Union[tuple, PerceiverClassifierOutput]:
          r"""
          inputs (`torch.FloatTensor`):
@@ -1229,6 +1237,7 @@ class PerceiverForImageClassificationFourier(PerceiverPreTrainedModel):
          labels: Optional[torch.Tensor] = None,
          return_dict: Optional[bool] = None,
          pixel_values: Optional[torch.Tensor] = None,
+         **kwargs,
      ) -> Union[tuple, PerceiverClassifierOutput]:
          r"""
          inputs (`torch.FloatTensor`):
@@ -1350,6 +1359,7 @@ class PerceiverForImageClassificationConvProcessing(PerceiverPreTrainedModel):
          labels: Optional[torch.Tensor] = None,
          return_dict: Optional[bool] = None,
          pixel_values: Optional[torch.Tensor] = None,
+         **kwargs,
      ) -> Union[tuple, PerceiverClassifierOutput]:
          r"""
          inputs (`torch.FloatTensor`):
@@ -1487,6 +1497,7 @@ class PerceiverForOpticalFlow(PerceiverPreTrainedModel):
          output_hidden_states: Optional[bool] = None,
          labels: Optional[torch.Tensor] = None,
          return_dict: Optional[bool] = None,
+         **kwargs,
      ) -> Union[tuple, PerceiverClassifierOutput]:
          r"""
          inputs (`torch.FloatTensor`):
@@ -1695,6 +1706,7 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
          output_hidden_states: Optional[bool] = None,
          labels: Optional[torch.Tensor] = None,
          return_dict: Optional[bool] = None,
+         **kwargs,
      ) -> Union[tuple, PerceiverClassifierOutput]:
          r"""
          inputs (`torch.FloatTensor`):
@@ -307,7 +307,6 @@ class PerceptionLMImageProcessorFast(BaseImageProcessorFast):
              processed_images_grouped[shape] = stacked_images
          processed_images = reorder_images(processed_images_grouped, grouped_images_index)
          processed_images = [p[None] if p.ndim == 3 else p for p in processed_images]  # add tiles dimension if needed
-         processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
          return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -451,6 +451,7 @@ class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, GenerationMixin):
          attention_mask=None,
          cache_position=None,
          logits_to_keep=None,
+         is_first_iteration=False,
          **kwargs,
      ):
          # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -462,12 +463,15 @@ class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, GenerationMixin):
              attention_mask=attention_mask,
              cache_position=cache_position,
              logits_to_keep=logits_to_keep,
+             is_first_iteration=is_first_iteration,
              **kwargs,
          )

-         if cache_position[0] == 0:
-             # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-             # Otherwise we need pixel values to be passed to model
+         if is_first_iteration or not kwargs.get("use_cache", True):
+             # Pixel values are only used on the first iteration, if available.
+             # In subsequent iterations they are already merged with the text and cached.
+             # NOTE: the first iteration is not necessarily prefill; it can be the first
+             # iteration with a new question on top of a cached system prompt (continuing generation from a cache).
              model_inputs["pixel_values"] = pixel_values
              model_inputs["pixel_values_videos"] = pixel_values_videos
          return model_inputs
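
The move from `cache_position[0] == 0` to `is_first_iteration` matters when generation resumes from a pre-filled cache (for example a cached system prompt): `cache_position[0]` is then non-zero even though the visual features have not been merged into the sequence yet. The gating logic, reduced to a toy predicate:

```python
def should_pass_pixels(is_first_iteration: bool, use_cache: bool) -> bool:
    # First call (prefill, or first step on top of a cached prompt): pixels needed.
    # Later cached steps: visual features already live in the KV cache.
    # No cache at all: pixels must be recomputed on every step.
    return is_first_iteration or not use_cache

assert should_pass_pixels(True, True)
assert not should_pass_pixels(False, True)
assert should_pass_pixels(False, False)
```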
@@ -293,6 +293,7 @@ class PerceptionLMForConditionalGeneration(LlavaForConditionalGeneration):
          attention_mask=None,
          cache_position=None,
          logits_to_keep=None,
+         is_first_iteration=False,
          **kwargs,
      ):
          # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -304,12 +305,15 @@ class PerceptionLMForConditionalGeneration(LlavaForConditionalGeneration):
              attention_mask=attention_mask,
              cache_position=cache_position,
              logits_to_keep=logits_to_keep,
+             is_first_iteration=is_first_iteration,
              **kwargs,
          )

-         if cache_position[0] == 0:
-             # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-             # Otherwise we need pixel values to be passed to model
+         if is_first_iteration or not kwargs.get("use_cache", True):
+             # Pixel values are only used on the first iteration, if available.
+             # In subsequent iterations they are already merged with the text and cached.
+             # NOTE: the first iteration is not necessarily prefill; it can be the first
+             # iteration with a new question on top of a cached system prompt (continuing generation from a cache).
              model_inputs["pixel_values"] = pixel_values
              model_inputs["pixel_values_videos"] = pixel_values_videos
          return model_inputs